Unverified Commit 9aeacb58, authored by Thomas Wolf and committed by GitHub

Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)

* [WIP] SP tokenizers

* fixing tests for T5

* WIP tokenizers

* serialization

* update T5

* WIP T5 tokenization

* slow to fast conversion script

* Refactoring to move tokenizer implementations inside transformers

* Adding gpt - refactoring - quality

* WIP adding several tokenizers to the fast world

* WIP Roberta - moving implementations

* update to dev4 switch file loading to in-memory loading

* Updating and fixing

* advancing on the tokenizers - updating do_lower_case

* style and quality

* moving forward with tokenizers conversion and tests

* MBart, T5

* dropping the fast version of Transformer-XL

* Adding to autotokenizers + style/quality

* update init and space_between_special_tokens

* style and quality

* bump up tokenizers version

* add protobuf

* fix pickle Bert JP with Mecab

* fix newly added tokenizers

* style and quality

* fix bert japanese

* fix funnel

* limit tokenizer warning to one occurrence

* clean up file

* fix new tokenizers

* fast tokenizers deep tests

* WIP adding all the special fast tests on the new fast tokenizers

* quick fix

* adding more fast tokenizers in the fast tests

* all tokenizers in fast version tested

* Adding BertGenerationFast

* bump up setup.py for CI

* remove BertGenerationFast (too early)

* bump up tokenizers version

* Clean old docstrings

* Typo

* Update following Lysandre comments
Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
parent 4d04120c
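
For context before the diff below, here is a minimal usage sketch (not part of this commit) of the fast, Rust-backed SentencePiece tokenizers this PR introduces; the checkpoint name and the use_fast flag are illustrative assumptions, not taken from the diff:

from transformers import AutoTokenizer, T5TokenizerFast

# Instantiate the Rust-backed tokenizer class directly...
tok_fast = T5TokenizerFast.from_pretrained("t5-small")

# ...or opt in through AutoTokenizer (use_fast selects the fast implementation).
tok_auto = AutoTokenizer.from_pretrained("t5-small", use_fast=True)

encoding = tok_fast("Studies have shown that owning a dog is good for you.")
print(tok_fast.is_fast)        # True for the Rust-backed tokenizer
print(encoding.input_ids[:5])  # token ids, expected to match the slow tokenizer's output
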
# coding=utf-8
# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest

from transformers.testing_utils import _torch_available
from transformers.tokenization_camembert import CamembertTokenizer, CamembertTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin


SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")

FRAMEWORK = "pt" if _torch_available else "tf"


class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = CamembertTokenizer
    rust_tokenizer_class = CamembertTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
        tokenizer.save_pretrained(self.tmpdirname)

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "I was born in 92000, and this is falsé."

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)
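
The test above is the pattern this PR applies everywhere: encode the same text with the slow (Python) and fast (Rust) tokenizer and require identical ids. A standalone sketch of that check against a published checkpoint (the model name is illustrative, not part of this diff):

from transformers import CamembertTokenizer, CamembertTokenizerFast

slow = CamembertTokenizer.from_pretrained("camembert-base")
fast = CamembertTokenizerFast.from_pretrained("camembert-base")

text = "I was born in 92000, and this is falsé."
# Ids should match with and without the special tokens added around the sequence.
assert slow.encode(text, add_special_tokens=False) == fast.encode(text, add_special_tokens=False)
assert slow.encode(text) == fast.encode(text)
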
@@ -56,7 +56,9 @@ def merge_model_tokenizer_mappings(

class TokenizerTesterMixin:

    tokenizer_class = None
+    rust_tokenizer_class = None
    test_rust_tokenizer = False
+    space_between_special_tokens = False

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()

@@ -68,12 +70,15 @@ class TokenizerTesterMixin:
        input_txt = self.get_clean_sequence(tokenizer)[0]
        return input_txt, input_txt

-    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20) -> Tuple[str, list]:
+    def get_clean_sequence(self, tokenizer, with_prefix_space=False, max_length=20, min_length=5) -> Tuple[str, list]:
        toks = [(i, tokenizer.decode([i], clean_up_tokenization_spaces=False)) for i in range(len(tokenizer))]
        toks = list(filter(lambda t: re.match(r"^[ a-zA-Z]+$", t[1]), toks))
        toks = list(filter(lambda t: [t[0]] == tokenizer.encode(t[1], add_special_tokens=False), toks))
        if max_length is not None and len(toks) > max_length:
            toks = toks[:max_length]
+        if min_length is not None and len(toks) < min_length and len(toks) > 0:
+            while len(toks) < min_length:
+                toks = toks + toks
        # toks_str = [t[1] for t in toks]
        toks_ids = [t[0] for t in toks]

@@ -99,7 +104,7 @@ class TokenizerTesterMixin:
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_rust_tokenizer(self, **kwargs) -> PreTrainedTokenizerFast:
-        raise NotImplementedError
+        return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    # def get_input_output_texts(self) -> Tuple[str, str]:
    #     """Feel free to overwrite"""

@@ -118,6 +123,29 @@ class TokenizerTesterMixin:
            for i in range(len(batch_encode_plus_sequences["input_ids"]))
        ]

+    def test_rust_and_python_full_tokenizers(self):
+        if not self.test_rust_tokenizer:
+            return
+
+        tokenizer = self.get_tokenizer()
+        rust_tokenizer = self.get_rust_tokenizer()
+
+        sequence, _ = self.get_input_output_texts(tokenizer)
+
+        # We don't have an exact equivalence on `tokenize()` between Rust and Slow
+        # Slow tokenizer only split tokens, Rust tokenizers will replace with <unk>
+        # tokens = tokenizer.tokenize(sequence)
+        # rust_tokens = rust_tokenizer.tokenize(sequence)
+        # self.assertListEqual(tokens, rust_tokens)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=False)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
+        self.assertListEqual(ids, rust_ids)
+
+        ids = tokenizer.encode(sequence, add_special_tokens=True)
+        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=True)
+        self.assertListEqual(ids, rust_ids)
+
    def test_tokenizers_common_properties(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:

@@ -241,6 +269,9 @@ class TokenizerTesterMixin:
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
+                    continue
+
                special_token = tokenizer.all_special_tokens[0]
                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token

@@ -272,6 +303,9 @@ class TokenizerTesterMixin:
        tokenizers = self.get_tokenizers(fast=False, do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
+                    continue
+
                special_token = tokenizer.all_special_tokens[0]
                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token

@@ -282,7 +316,7 @@ class TokenizerTesterMixin:
                toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

                added = tokenizer.add_tokens(new_toks)
-                self.assertEqual(added, 4)
+                self.assertIn(added, [2, 4])

                toks = tokenizer.tokenize(text)
                toks2 = tokenizer.tokenize(text2)

@@ -390,12 +424,17 @@ class TokenizerTesterMixin:
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                new_toks = ["[ABC]", "[DEF]"]  # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
+                # new_toks = ["[ABC]", "[DEF]"]  # TODO(thom) add this one back when Rust toks are ready: , "GHI IHG"]
+                new_toks = [AddedToken("[ABC]", normalized=False), AddedToken("[DEF]", normalized=False)]
                tokenizer.add_tokens(new_toks)
-                input = "[ABC] [DEF] [ABC] [DEF]"  # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+                input = "[ABC][DEF][ABC][DEF]"  # TODO(thom) add back cf above: "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+                if self.space_between_special_tokens:
+                    output = "[ABC] [DEF] [ABC] [DEF]"
+                else:
+                    output = input
                encoded = tokenizer.encode(input, add_special_tokens=False)
-                decoded = tokenizer.decode(encoded)
-                self.assertEqual(decoded, input)
+                decoded = tokenizer.decode(encoded, spaces_between_special_tokens=self.space_between_special_tokens)
+                self.assertIn(decoded, [output, output.lower()])

    def test_pretrained_model_lists(self):
        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())

@@ -447,7 +486,7 @@ class TokenizerTesterMixin:
                sequence = tokenizer.encode(seq_0, add_special_tokens=False)
                total_length = len(sequence)

-                assert total_length > 1, "Issue with the testing sequence, please update it it's too short"
+                assert total_length > 4, "Issue with the testing sequence, please update it it's too short"

                # Test with max model input length
                model_max_length = tokenizer.model_max_length

@@ -546,6 +585,7 @@ class TokenizerTesterMixin:
                model_max_length = tokenizer.model_max_length
                self.assertEqual(model_max_length, 100)
                seq_2 = seq_0 * model_max_length
+                assert len(seq_2) > model_max_length

                sequence1 = tokenizer(seq_1, add_special_tokens=False)
                total_length1 = len(sequence1["input_ids"])

@@ -559,9 +599,9 @@ class TokenizerTesterMixin:
                    [False, True, "longest"] if tokenizer.pad_token and tokenizer.pad_token_id >= 0 else [False]
                )
                for padding_state in padding_strategies:
-                    with self.subTest(f"Padding: {padding_state}"):
+                    with self.subTest(f"{tokenizer.__class__.__name__} Padding: {padding_state}"):
                        for truncation_state in [True, "longest_first", "only_first"]:
-                            with self.subTest(f"Truncation: {truncation_state}"):
+                            with self.subTest(f"{tokenizer.__class__.__name__} Truncation: {truncation_state}"):
                                output = tokenizer(seq_2, seq_1, padding=padding_state, truncation=truncation_state)
                                self.assertEqual(len(output["input_ids"]), model_max_length)

@@ -748,34 +788,47 @@ class TokenizerTesterMixin:
        #         # This is not supported with the Rust tokenizers
        #         # self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)

-    def test_swap_special_token(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                mask = "<mask>"
-                sequence = "Encode this sequence"
-                sequence_masked_0 = "Encode <mask> sequence"
-                sequence_masked_1 = "<mask> this sequence"
-
-                # Add tokens so that masked token isn't split
-                tokenizer.add_tokens(sequence.split())
-                tokenizer.add_special_tokens({"mask_token": mask})
-                mask_ind = tokenizer.convert_tokens_to_ids(mask)
-                encoded = tokenizer.encode(sequence, add_special_tokens=False)
-
-                # Test first masked sequence
-                encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
-                mask_loc = encoded_masked.index(mask_ind)
-                encoded_masked[mask_loc] = encoded[mask_loc]
-
-                self.assertEqual(encoded_masked, encoded)
-
-                # Test second masked sequence
-                encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
-                mask_loc = encoded_masked.index(mask_ind)
-                encoded_masked[mask_loc] = encoded[mask_loc]
-
-                self.assertEqual(encoded_masked, encoded)
+    # def test_swap_special_token(self):
+    #     tokenizers = self.get_tokenizers(do_lower_case=False)
+    #     for tokenizer in tokenizers:
+    #         with self.subTest(f"{tokenizer.__class__.__name__}"):
+    #             # Our mask token
+    #             mask = "<mask>"
+    #             # We take a single word in the middle of the vocabulary
+    #             all_tokens = sorted(tokenizer.get_vocab().keys())
+    #             word = tokenizer.decode(tokenizer.encode(all_tokens[len(all_tokens)//2], add_special_tokens=False)[:1])

+    #             sequence_0 = "Encode " + word + " sequence"
+    #             sequence_masked_0 = "Encode " + mask + " sequence"

+    #             sequence_1 = word + " this sequence"
+    #             sequence_masked_1 = mask + " this sequence"

+    #             # Add tokens so that masked token isn't split
+    #             # tokens = [AddedToken(t, lstrip=True, normalized=False) for t in sequence.split()]
+    #             # tokenizer.add_tokens(tokens)
+    #             tokenizer.add_special_tokens(
+    #                 {"mask_token": AddedToken(mask, normalized=False)}
+    #             )  # Eat left space on Byte-level BPE tokenizers
+    #             mask_ind = tokenizer.convert_tokens_to_ids(mask)

+    #             # Test first masked sequence
+    #             encoded_0 = tokenizer.encode(sequence_0, add_special_tokens=False)
+    #             encoded_masked = tokenizer.encode(sequence_masked_0, add_special_tokens=False)
+    #             assert len(encoded_masked) == len(encoded_0)
+    #             mask_loc = encoded_masked.index(mask_ind)
+    #             encoded_masked[mask_loc] = encoded_0[mask_loc]

+    #             self.assertEqual(encoded_masked, encoded_0)

+    #             # Test second masked sequence
+    #             encoded_1 = tokenizer.encode(sequence_1, add_special_tokens=False)
+    #             encoded_masked = tokenizer.encode(sequence_masked_1, add_special_tokens=False)
+    #             assert len(encoded_masked) == len(encoded_1)
+    #             mask_loc = encoded_masked.index(mask_ind)
+    #             encoded_masked[mask_loc] = encoded_1[mask_loc]

+    #             self.assertEqual(encoded_masked, encoded_1)

    def test_special_tokens_mask(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)

@@ -919,10 +972,10 @@ class TokenizerTesterMixin:
    def test_padding_to_multiple_of(self):
        tokenizers = self.get_tokenizers()
        for tokenizer in tokenizers:
-            if tokenizer.pad_token is None:
-                self.skipTest("No padding token.")
-            else:
-                with self.subTest(f"{tokenizer.__class__.__name__}"):
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                if tokenizer.pad_token is None:
+                    self.skipTest("No padding token.")
+                else:
                    empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
                    normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
                    for key, value in empty_tokens.items():

@@ -1063,14 +1116,15 @@ class TokenizerTesterMixin:
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                vocab = tokenizer.get_vocab()
-                self.assertIsInstance(vocab, dict)
+                vocab_dict = tokenizer.get_vocab()
+                self.assertIsInstance(vocab_dict, dict)
+                self.assertGreaterEqual(len(tokenizer), len(vocab_dict))
+
+                vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
                self.assertEqual(len(vocab), len(tokenizer))

                tokenizer.add_tokens(["asdfasdfasdfasdf"])
-                vocab = tokenizer.get_vocab()
-                self.assertIsInstance(vocab, dict)
+                vocab = [tokenizer.convert_ids_to_tokens(i) for i in range(len(tokenizer))]
                self.assertEqual(len(vocab), len(tokenizer))

    def test_conversion_reversible(self):

@@ -1079,6 +1133,8 @@ class TokenizerTesterMixin:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab = tokenizer.get_vocab()
                for word, ind in vocab.items():
+                    if word == tokenizer.unk_token:
+                        continue
                    self.assertEqual(tokenizer.convert_tokens_to_ids(word), ind)
                    self.assertEqual(tokenizer.convert_ids_to_tokens(ind), word)

@@ -1173,12 +1229,13 @@ class TokenizerTesterMixin:
    def test_added_token_serializable(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
-            new_token = AddedToken("new_token", lstrip=True)
-            tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
-
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                tokenizer.save_pretrained(tmp_dir_name)
-                tokenizer.from_pretrained(tmp_dir_name)
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                new_token = AddedToken("new_token", lstrip=True)
+                tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
+
+                with tempfile.TemporaryDirectory() as tmp_dir_name:
+                    tokenizer.save_pretrained(tmp_dir_name)
+                    tokenizer.from_pretrained(tmp_dir_name)

    def test_batch_encode_plus_padding(self):
        # Test that padded sequences are equivalent between batch_encode_plus and encode_plus

@@ -1243,6 +1300,9 @@ class TokenizerTesterMixin:
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                if hasattr(tokenizer, "add_prefix_space") and not tokenizer.add_prefix_space:
+                    continue
+
                # Prepare a sequence from our tokenizer vocabulary
                sequence, ids = self.get_clean_sequence(tokenizer, with_prefix_space=True, max_length=20)
                # sequence = " " + sequence  # To be sure the byte-level tokenizers are feeling good

@@ -1345,12 +1405,14 @@ class TokenizerTesterMixin:
    def test_prepare_for_model(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
-            string_sequence = "Testing the prepare_for_model method."
-            ids = tokenizer.encode(string_sequence, add_special_tokens=False)
-            input_dict = tokenizer.encode_plus(string_sequence)
-            prepared_input_dict = tokenizer.prepare_for_model(ids)
-
-            self.assertEqual(input_dict, prepared_input_dict)
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                string_sequence = "Testing the prepare_for_model method."
+                ids = tokenizer.encode(string_sequence, add_special_tokens=False)
+                prepared_input_dict = tokenizer.prepare_for_model(ids, add_special_tokens=True)
+
+                input_dict = tokenizer.encode_plus(string_sequence, add_special_tokens=True)
+
+                self.assertEqual(input_dict, prepared_input_dict)

    def test_batch_encode_plus_overflowing_tokens(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
...
@@ -25,6 +25,7 @@ from .test_tokenization_common import TokenizerTesterMixin

class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = CTRLTokenizer
+    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
...
@@ -23,9 +23,8 @@ from .test_tokenization_bert import BertTokenizationTest

class DistilBertTokenizationTest(BertTokenizationTest):

    tokenizer_class = DistilBertTokenizer
+    rust_tokenizer_class = DistilBertTokenizerFast
+    test_rust_tokenizer = True

-    def get_rust_tokenizer(self, **kwargs):
-        return DistilBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    @slow
    def test_sequence_builders(self):
...
@@ -32,25 +32,22 @@ from .test_tokenization_bert import BertTokenizationTest

class DPRContextEncoderTokenizationTest(BertTokenizationTest):

    tokenizer_class = DPRContextEncoderTokenizer
+    rust_tokenizer_class = DPRContextEncoderTokenizerFast
+    test_rust_tokenizer = True

-    def get_rust_tokenizer(self, **kwargs):
-        return DPRContextEncoderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)


class DPRQuestionEncoderTokenizationTest(BertTokenizationTest):

    tokenizer_class = DPRQuestionEncoderTokenizer
+    rust_tokenizer_class = DPRQuestionEncoderTokenizerFast
+    test_rust_tokenizer = True

-    def get_rust_tokenizer(self, **kwargs):
-        return DPRQuestionEncoderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)


class DPRReaderTokenizationTest(BertTokenizationTest):

    tokenizer_class = DPRReaderTokenizer
+    rust_tokenizer_class = DPRReaderTokenizerFast
+    test_rust_tokenizer = True

-    def get_rust_tokenizer(self, **kwargs):
-        return DPRReaderTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

    @slow
    def test_decode_best_spans(self):
...
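
The three small diffs above all follow the same pattern: a model-specific test class declares its slow and fast tokenizer classes and opts into the shared slow/fast parity tests by setting test_rust_tokenizer = True. A sketch of that shape (the class name is illustrative, not part of this diff):

import unittest

from transformers import DistilBertTokenizer, DistilBertTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin


class ExampleTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    # The mixin picks these up in get_tokenizer() / get_rust_tokenizer()
    # and runs the common tests against both implementations.
    tokenizer_class = DistilBertTokenizer
    rust_tokenizer_class = DistilBertTokenizerFast
    test_rust_tokenizer = True
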
import logging
+import shutil
+import tempfile
import unittest
from collections import namedtuple
from itertools import takewhile

from transformers import (
+    AlbertTokenizer,
+    AlbertTokenizerFast,
+    BartTokenizer,
+    BartTokenizerFast,
    BertTokenizer,
    BertTokenizerFast,
+    CamembertTokenizer,
+    CamembertTokenizerFast,
    DistilBertTokenizer,
+    DistilBertTokenizerFast,
+    DPRContextEncoderTokenizer,
+    DPRContextEncoderTokenizerFast,
+    DPRQuestionEncoderTokenizer,
+    DPRQuestionEncoderTokenizerFast,
+    DPRReaderTokenizer,
+    DPRReaderTokenizerFast,
+    FunnelTokenizer,
+    FunnelTokenizerFast,
    GPT2Tokenizer,
    GPT2TokenizerFast,
+    LxmertTokenizer,
+    LxmertTokenizerFast,
+    MBartTokenizer,
+    MBartTokenizerFast,
    OpenAIGPTTokenizer,
-    PreTrainedTokenizer,
+    OpenAIGPTTokenizerFast,
+    PegasusTokenizer,
+    PegasusTokenizerFast,
+    ReformerTokenizer,
+    ReformerTokenizerFast,
    RobertaTokenizer,
+    RobertaTokenizerFast,
+    T5Tokenizer,
+    T5TokenizerFast,
+    XLMRobertaTokenizer,
+    XLMRobertaTokenizerFast,
+    XLNetTokenizer,
+    XLNetTokenizerFast,
    is_torch_available,
)
from transformers.testing_utils import get_tests_dir
-from transformers.tokenization_distilbert import DistilBertTokenizerFast
-from transformers.tokenization_openai import OpenAIGPTTokenizerFast
-from transformers.tokenization_roberta import RobertaTokenizerFast


logger = logging.getLogger(__name__)
@@ -40,245 +69,261 @@ class CommonFastTokenizerTest(unittest.TestCase):
TOKENIZERS_CLASSES = frozenset([]) TOKENIZERS_CLASSES = frozenset([])
def setUp(self) -> None: def setUp(self) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name)
self.tokenizers_list = [
(tok_case, pretrained_name, dict(t for t in tok_case.kwargs) if tok_case.kwargs else {})
for tok_case in self.TOKENIZERS_CLASSES
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys()
if tok_case.filter is None or (tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name))
]
with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data: with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
self._data = f_data.read().replace("\n\n", "\n").strip() self._data = f_data.read().replace("\n\n", "\n").strip()
def test_all_tokenizers(self): self.tmpdirname = tempfile.mkdtemp()
for tok_case in self.TOKENIZERS_CLASSES:
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys(): def tearDown(self):
shutil.rmtree(self.tmpdirname)
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name) def test_is_fast(self):
if tok_case.filter is None or ( for tok_case, pretrained_name, kwargs in self.tokenizers_list:
tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
): tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
kwargs = dict(t for t in tok_case.kwargs) if tok_case.kwargs else {} tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs) # Check is_fast is set correctly
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs) self.assertFalse(tokenizer_p.is_fast)
self.assertTrue(tokenizer_r.is_fast)
self.fast_align_python(tokenizer_r, tokenizer_p, tok_case, pretrained_name)
self.fast_only(tokenizer_r) def test_fast_only_inputs(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
def test_pretokenized_tokenizers(self): with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
for tok_case in self.TOKENIZERS_CLASSES: tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys():
# Ensure None raise an error
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the self.assertRaises(TypeError, tokenizer_r.tokenize, None)
# information available in Tokenizer (name, rust class, python class, vocab key name) self.assertRaises(TypeError, tokenizer_r.encode, None)
if tok_case.filter is None or ( self.assertRaises(TypeError, tokenizer_r.encode_plus, None)
tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name) self.assertRaises(TypeError, tokenizer_r.batch_encode_plus, None)
):
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): def test_alignement_methods(self):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, add_prefix_space=True) for tok_case, pretrained_name, kwargs in self.tokenizers_list:
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, add_prefix_space=True) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
self.assert_pretokenized_inputs(tokenizer_r, tokenizer_p)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"]
def fast_align_python(self, tokenizer_r, tokenizer_p, tok_case, pretrained_name): text = " ".join(words)
# Check is_fast is set correctly batch_size = 3
self.assertFalse(tokenizer_p.is_fast)
self.assertTrue(tokenizer_r.is_fast) encoding = tokenizer_r.encode_plus(text, add_special_tokens=False)
# Check that Rust and Python align batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False)
self.assert_tokenization_python_rust_equals(tokenizer_r, tokenizer_p) num_tokens = len(encoding["input_ids"])
self.assert_num_special_tokens_to_add_equal(tokenizer_r, tokenizer_p)
self.assert_max_length_equal(tokenizer_r, tokenizer_p) last_word_index = len(words) - 1
self.assert_special_tokens_map_equal(tokenizer_r, tokenizer_p) last_token_index = num_tokens - 1
self.assert_embeded_special_tokens(tokenizer_r, tokenizer_p) last_batch_index = batch_size - 1
self.assert_padding(tokenizer_r, tokenizer_p) last_char_index = len(text) - 1
self.assert_create_token_type_ids(tokenizer_r, tokenizer_p)
self.assert_prepare_for_model(tokenizer_r, tokenizer_p) # words, tokens
self.assertEqual(len(encoding.words(0)), num_tokens)
def fast_only(self, tokenizer_r): self.assertEqual(max(encoding.words(0)), last_word_index)
# Ensure None raise an error self.assertEqual(min(encoding.words(0)), 0)
self.assertRaises(ValueError, tokenizer_r.tokenize, None) self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens)
self.assertRaises(ValueError, tokenizer_r.encode, None) self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index)
self.assertRaises(ValueError, tokenizer_r.encode_plus, None) self.assertEqual(min(batch_encoding.words(last_batch_index)), 0)
self.assertRaises(ValueError, tokenizer_r.batch_encode_plus, None) self.assertEqual(len(encoding.tokens(0)), num_tokens)
self.assert_add_tokens(tokenizer_r) # Assert token_to_word
self.assert_offsets_mapping(tokenizer_r) self.assertEqual(encoding.token_to_word(0), 0)
self.assert_add_special_tokens(tokenizer_r) self.assertEqual(encoding.token_to_word(0, 0), 0)
self.assert_alignement_methods(tokenizer_r) self.assertEqual(encoding.token_to_word(last_token_index), last_word_index)
self.assert_batch_encode_dynamic_overflowing(tokenizer_r) self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index)
self.assertEqual(batch_encoding.token_to_word(1, 0), 0)
def assert_alignement_methods(self, tokenizer_r): self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index)
words = ["Wonderful", "no", "inspiration", "example", "with", "subtoken"] self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index)
text = " ".join(words)
batch_size = 3 # Assert word_to_tokens
self.assertEqual(encoding.word_to_tokens(0).start, 0)
encoding = tokenizer_r.encode_plus(text, add_special_tokens=False) self.assertEqual(encoding.word_to_tokens(0, 0).start, 0)
self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1)
batch_encoding = tokenizer_r.batch_encode_plus([text] * batch_size, add_special_tokens=False) self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
num_tokens = len(encoding["input_ids"]) self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0)
self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1)
last_word_index = len(words) - 1 self.assertEqual(
last_token_index = num_tokens - 1 batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1
last_batch_index = batch_size - 1 )
last_char_index = len(text) - 1
# Assert token_to_chars
# words, tokens self.assertEqual(encoding.token_to_chars(0).start, 0)
self.assertEqual(len(encoding.words(0)), num_tokens) self.assertEqual(encoding.token_to_chars(0, 0).start, 0)
self.assertEqual(max(encoding.words(0)), last_word_index) self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1)
self.assertEqual(min(encoding.words(0)), 0) self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(len(batch_encoding.words(last_batch_index)), num_tokens) self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0)
self.assertEqual(max(batch_encoding.words(last_batch_index)), last_word_index) self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1)
self.assertEqual(min(batch_encoding.words(last_batch_index)), 0) self.assertEqual(
self.assertEqual(len(encoding.tokens(0)), num_tokens) batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1
)
# Assert token_to_word
self.assertEqual(encoding.token_to_word(0), 0) # Assert char_to_token
self.assertEqual(encoding.token_to_word(0, 0), 0) self.assertEqual(encoding.char_to_token(0), 0)
self.assertEqual(encoding.token_to_word(last_token_index), last_word_index) self.assertEqual(encoding.char_to_token(0, 0), 0)
self.assertEqual(encoding.token_to_word(0, last_token_index), last_word_index) self.assertEqual(encoding.char_to_token(last_char_index), last_token_index)
self.assertEqual(batch_encoding.token_to_word(1, 0), 0) self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.token_to_word(0, last_token_index), last_word_index) self.assertEqual(batch_encoding.char_to_token(1, 0), 0)
self.assertEqual(batch_encoding.token_to_word(last_batch_index, last_token_index), last_word_index) self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index)
self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index)
# Assert word_to_tokens
self.assertEqual(encoding.word_to_tokens(0).start, 0) # Assert char_to_word
self.assertEqual(encoding.word_to_tokens(0, 0).start, 0) self.assertEqual(encoding.char_to_word(0), 0)
self.assertEqual(encoding.word_to_tokens(last_word_index).end, last_token_index + 1) self.assertEqual(encoding.char_to_word(0, 0), 0)
self.assertEqual(encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) self.assertEqual(encoding.char_to_word(last_char_index), last_word_index)
self.assertEqual(batch_encoding.word_to_tokens(1, 0).start, 0) self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.word_to_tokens(0, last_word_index).end, last_token_index + 1) self.assertEqual(batch_encoding.char_to_word(1, 0), 0)
self.assertEqual(batch_encoding.word_to_tokens(last_batch_index, last_word_index).end, last_token_index + 1) self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index)
# Assert token_to_chars
self.assertEqual(encoding.token_to_chars(0).start, 0) # Assert word_to_chars
self.assertEqual(encoding.token_to_chars(0, 0).start, 0) self.assertEqual(encoding.word_to_chars(0).start, 0)
self.assertEqual(encoding.token_to_chars(last_token_index).end, last_char_index + 1) self.assertEqual(encoding.word_to_chars(0, 0).start, 0)
self.assertEqual(encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.token_to_chars(1, 0).start, 0) self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(batch_encoding.token_to_chars(0, last_token_index).end, last_char_index + 1) self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0)
self.assertEqual(batch_encoding.token_to_chars(last_batch_index, last_token_index).end, last_char_index + 1) self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1)
self.assertEqual(
# Assert char_to_token batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1
self.assertEqual(encoding.char_to_token(0), 0) )
self.assertEqual(encoding.char_to_token(0, 0), 0)
self.assertEqual(encoding.char_to_token(last_char_index), last_token_index) def test_tokenization_python_rust_equals(self):
self.assertEqual(encoding.char_to_token(0, last_char_index), last_token_index) for tok_case, pretrained_name, kwargs in self.tokenizers_list:
self.assertEqual(batch_encoding.char_to_token(1, 0), 0) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
self.assertEqual(batch_encoding.char_to_token(0, last_char_index), last_token_index) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
self.assertEqual(batch_encoding.char_to_token(last_batch_index, last_char_index), last_token_index) tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Assert char_to_word # Ensure basic input match
self.assertEqual(encoding.char_to_word(0), 0) input_p = tokenizer_p.encode_plus(self._data)
self.assertEqual(encoding.char_to_word(0, 0), 0) input_r = tokenizer_r.encode_plus(self._data)
self.assertEqual(encoding.char_to_word(last_char_index), last_word_index)
self.assertEqual(encoding.char_to_word(0, last_char_index), last_word_index) for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertEqual(batch_encoding.char_to_word(1, 0), 0) self.assertSequenceEqual(input_p[key], input_r[key])
self.assertEqual(batch_encoding.char_to_word(0, last_char_index), last_word_index)
self.assertEqual(batch_encoding.char_to_word(last_batch_index, last_char_index), last_word_index) input_pairs_p = tokenizer_p.encode_plus(self._data, self._data)
input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)
# Assert word_to_chars
self.assertEqual(encoding.word_to_chars(0).start, 0) for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertEqual(encoding.word_to_chars(0, 0).start, 0) self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
self.assertEqual(encoding.word_to_chars(last_word_index).end, last_char_index + 1)
self.assertEqual(encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) # Ensure truncation match
self.assertEqual(batch_encoding.word_to_chars(1, 0).start, 0) input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True)
self.assertEqual(batch_encoding.word_to_chars(0, last_word_index).end, last_char_index + 1) input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True)
self.assertEqual(batch_encoding.word_to_chars(last_batch_index, last_word_index).end, last_char_index + 1)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
def assert_tokenization_python_rust_equals(self, tokenizer_r, tokenizer_p): self.assertSequenceEqual(input_p[key], input_r[key])
# Ensure basic input match
input_p = tokenizer_p.encode_plus(self._data) # Ensure truncation with stride match
input_r = tokenizer_r.encode_plus(self._data) input_p = tokenizer_p.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): )
self.assertSequenceEqual(input_p[key], input_r[key]) input_r = tokenizer_r.encode_plus(
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
input_pairs_p = tokenizer_p.encode_plus(self._data, self._data) )
input_pairs_r = tokenizer_r.encode_plus(self._data, self._data)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): self.assertSequenceEqual(input_p[key], input_r[key][0])
self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
def test_num_special_tokens_to_add_equal(self):
# Ensure truncation match for tok_case, pretrained_name, kwargs in self.tokenizers_list:
input_p = tokenizer_p.encode_plus(self._data, max_length=512, truncation=True) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
input_r = tokenizer_r.encode_plus(self._data, max_length=512, truncation=True) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()):
self.assertSequenceEqual(input_p[key], input_r[key]) # Check we have the same number of added_tokens for both pair and non-pair inputs.
self.assertEqual(
# Ensure truncation with stride match tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)
input_p = tokenizer_p.encode_plus( )
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True self.assertEqual(
) tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)
input_r = tokenizer_r.encode_plus( )
self._data, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
) def test_max_length_equal(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
self.assertSequenceEqual(input_p[key], input_r[key][0]) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
def assert_num_special_tokens_to_add_equal(self, tokenizer_r, tokenizer_p):
# Check we have the same number of added_tokens for both pair and non-pair inputs. # Check we have the correct max_length for both pair and non-pair inputs.
self.assertEqual(tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)) self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
self.assertEqual(tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)) self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)
def assert_max_length_equal(self, tokenizer_r, tokenizer_p): def test_special_tokens_map_equal(self):
# Check we have the correct max_length for both pair and non-pair inputs. for tok_case, pretrained_name, kwargs in self.tokenizers_list:
self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
def assert_special_tokens_map_equal(self, tokenizer_r, tokenizer_p):
# Assert the set of special tokens match. # Assert the set of special tokens match.
self.assertSequenceEqual( self.assertSequenceEqual(
tokenizer_p.special_tokens_map.items(), tokenizer_p.special_tokens_map.items(),
tokenizer_r.special_tokens_map.items(), tokenizer_r.special_tokens_map.items(),
) )
def assert_add_tokens(self, tokenizer_r): def test_add_tokens(self):
vocab_size = tokenizer_r.vocab_size for tok_case, pretrained_name, kwargs in self.tokenizers_list:
self.assertEqual(tokenizer_r.add_tokens(""), 0) with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
self.assertEqual(tokenizer_r.add_tokens("testoken"), 1) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
self.assertEqual(len(tokenizer_r), vocab_size + 3) vocab_size = len(tokenizer_r)
self.assertEqual(tokenizer_r.add_tokens(""), 0)
self.assertEqual(tokenizer_r.add_special_tokens({}), 0) self.assertEqual(tokenizer_r.add_tokens("testoken"), 1)
self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2) self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
self.assertRaises( self.assertEqual(len(tokenizer_r), vocab_size + 3)
AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
) self.assertEqual(tokenizer_r.add_special_tokens({}), 0)
self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1) self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
self.assertEqual( self.assertRaises(
tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2 AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
) )
self.assertEqual(len(tokenizer_r), vocab_size + 8) self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
self.assertEqual(
def assert_offsets_mapping(self, tokenizer_r): tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
text = "Wonderful no inspiration example with subtoken" )
pair = "Along with an awesome pair" self.assertEqual(len(tokenizer_r), vocab_size + 8)
# No pair def test_offsets_mapping(self):
tokens_with_offsets = tokenizer_r.encode_plus( for tok_case, pretrained_name, kwargs in self.tokenizers_list:
text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
) tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
added_tokens = tokenizer_r.num_special_tokens_to_add(False)
offsets = tokens_with_offsets["offset_mapping"] text = "Wonderful no inspiration example with subtoken"
pair = "Along with an awesome pair"
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) # No pair
tokens_with_offsets = tokenizer_r.encode_plus(
# Assert there is online added_tokens special_tokens text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) )
added_tokens = tokenizer_r.num_special_tokens_to_add(False)
# Pairs offsets = tokens_with_offsets["offset_mapping"]
tokens_with_offsets = tokenizer_r.encode_plus(
text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True # Assert there is the same number of tokens and offsets
) self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
added_tokens = tokenizer_r.num_special_tokens_to_add(True)
offsets = tokens_with_offsets["offset_mapping"] # Assert there is online added_tokens special_tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) # Pairs
tokens_with_offsets = tokenizer_r.encode_plus(
# Assert there is online added_tokens special_tokens text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) )
added_tokens = tokenizer_r.num_special_tokens_to_add(True)
def assert_batch_encode_dynamic_overflowing(self, tokenizer: PreTrainedTokenizer): offsets = tokens_with_offsets["offset_mapping"]
# Assert there is the same number of tokens and offsets
self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"]))
# Assert there is online added_tokens special_tokens
self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
def test_batch_encode_dynamic_overflowing(self):
""" """
When calling batch_encode with multiple sequence it can returns different number of When calling batch_encode with multiple sequence it can returns different number of
overflowing encoding for each sequence: overflowing encoding for each sequence:
...@@ -289,437 +334,515 @@ class CommonFastTokenizerTest(unittest.TestCase): ...@@ -289,437 +334,515 @@ class CommonFastTokenizerTest(unittest.TestCase):
] ]
This needs to be padded so that it can represented as a tensor This needs to be padded so that it can represented as a tensor
""" """
returned_tensor = "pt" if is_torch_available() else "tf" for tok_case, pretrained_name, kwargs in self.tokenizers_list:
tokenizer = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
return with self.subTest("{} ({}, {})".format(tok_case.name, pretrained_name, tokenizer.__class__.__name__)):
tokens = tokenizer.encode_plus( returned_tensor = "pt" if is_torch_available() else "tf"
"HuggingFace is solving NLP one commit at a time",
max_length=6, if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
padding=True, return
truncation=True,
return_tensors=returned_tensor, tokens = tokenizer.encode_plus(
return_overflowing_tokens=True, "HuggingFace is solving NLP one commit at a time",
) max_length=6,
padding=True,
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): truncation=True,
self.assertEqual(len(tokens[key].shape), 2) return_tensors=returned_tensor,
return_overflowing_tokens=True,
# Mono sample )
tokens = tokenizer.batch_encode_plus(
["HuggingFace is solving NLP one commit at a time"], for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
max_length=6, self.assertEqual(len(tokens[key].shape), 2)
padding=True,
truncation="only_first", # Mono sample
return_tensors=returned_tensor, tokens = tokenizer.batch_encode_plus(
return_overflowing_tokens=True, ["HuggingFace is solving NLP one commit at a time"],
) max_length=6,
padding=True,
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): truncation="only_first",
self.assertEqual(len(tokens[key].shape), 2) return_tensors=returned_tensor,
self.assertEqual(tokens[key].shape[-1], 6) return_overflowing_tokens=True,
)
# Multi sample
tokens = tokenizer.batch_encode_plus( for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
["HuggingFace is solving NLP one commit at a time", "Very tiny input"], self.assertEqual(len(tokens[key].shape), 2)
max_length=6, self.assertEqual(tokens[key].shape[-1], 6)
padding=True,
truncation="only_first", # Multi sample
return_tensors=returned_tensor, tokens = tokenizer.batch_encode_plus(
return_overflowing_tokens=True, ["HuggingFace is solving NLP one commit at a time", "Very tiny input"],
) max_length=6,
padding=True,
for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()): truncation="only_first",
self.assertEqual(len(tokens[key].shape), 2) return_tensors=returned_tensor,
self.assertEqual(tokens[key].shape[-1], 6) return_overflowing_tokens=True,
)
def assert_pretokenized_inputs(self, tokenizer_r, tokenizer_p):
# Input string for key in filter(lambda x: "overflow_to_sample_mapping" not in x, tokens.keys()):
pretokenized_input_simple = "This is a sample input".split() self.assertEqual(len(tokens[key].shape), 2)
pretokenized_input_pair = "This is a sample pair".split() self.assertEqual(tokens[key].shape[-1], 6)
    def test_pretokenized_inputs(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)

                # Input string
                pretokenized_input_simple = "This is a sample input".split()
                pretokenized_input_pair = "This is a sample pair".split()

                # Test encode for pretokenized inputs
                output_r = tokenizer_r.encode(
                    pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
                )
                output_p = tokenizer_p.encode(
                    pretokenized_input_simple, is_split_into_words=True, add_special_tokens=False
                )
                self.assertEqual(output_p, output_r)

                kwargs = {
                    "is_split_into_words": True,
                    # "return_token_type_ids": True, # Use the defaults for each tokenizers
                    # "return_attention_mask": True, # Use the defaults for each tokenizers
                    "return_overflowing_tokens": False,
                    "return_special_tokens_mask": True,
                    "return_offsets_mapping": False,  # Not implemented in python tokenizers
                    # "add_special_tokens": False,
                }
                batch_kwargs = {
                    "is_split_into_words": True,
                    # "return_token_type_ids": True, # Use the defaults for each tokenizers
                    # "return_attention_mask": True, # Use the defaults for each tokenizers
                    "return_overflowing_tokens": False,
                    "return_special_tokens_mask": True,
                    "return_offsets_mapping": False,  # Not implemented in python tokenizers
                    # "add_special_tokens": False,
                }

                # Test encode_plus for pretokenized inputs
                output_r = tokenizer_r.encode_plus(pretokenized_input_simple, **kwargs)
                output_p = tokenizer_p.encode_plus(pretokenized_input_simple, **kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test batch_encode_plus for pretokenized inputs
                input_batch = ([pretokenized_input_simple] * 2) + [pretokenized_input_simple + pretokenized_input_pair]
                output_r = tokenizer_r.batch_encode_plus(input_batch, **batch_kwargs)
                output_p = tokenizer_p.batch_encode_plus(input_batch, **batch_kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test encode for pretokenized inputs pairs
                output_r = tokenizer_r.encode(
                    pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
                )
                output_p = tokenizer_p.encode(
                    pretokenized_input_simple, pretokenized_input_pair, is_split_into_words=True
                )
                self.assertEqual(output_p, output_r)

                # Test encode_plus for pretokenized inputs
                output_r = tokenizer_r.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
                output_p = tokenizer_p.encode_plus(pretokenized_input_simple, pretokenized_input_pair, **kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])

                # Test batch_encode_plus for pretokenized inputs
                input_batch_pair = ([pretokenized_input_simple, pretokenized_input_pair] * 2) + [
                    pretokenized_input_simple + pretokenized_input_pair,
                    pretokenized_input_pair,
                ]
                output_r = tokenizer_r.batch_encode_plus(input_batch_pair, **batch_kwargs)
                output_p = tokenizer_p.batch_encode_plus(input_batch_pair, **batch_kwargs)
                for key in output_p.keys():
                    self.assertEqual(output_p[key], output_r[key])
    def test_create_token_type_ids(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
                input_simple = [1, 2, 3]
                input_pair = [1, 2, 3]

                # Generate output
                output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple)
                output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple)
                self.assertEqual(output_p, output_r)

                # Generate pair output
                output_r = tokenizer_r.create_token_type_ids_from_sequences(input_simple, input_pair)
                output_p = tokenizer_p.create_token_type_ids_from_sequences(input_simple, input_pair)
                self.assertEqual(output_p, output_r)
    def test_build_inputs_with_special_tokens(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
                # # Input string
                # input_simple = tokenizer_p.tokenize("This is a sample input", add_special_tokens=False)
                # input_pair = tokenizer_p.tokenize("This is a sample pair", add_special_tokens=False)

                # # Generate output
                # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
                # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
                # self.assertEqual(output_p, output_r)

                # # Generate pair output
                # output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
                # output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                # self.assertEqual(output_p, output_r)

                # Input tokens id
                input_simple = tokenizer_p.encode("This is a sample input", add_special_tokens=False)
                input_pair = tokenizer_p.encode("This is a sample pair", add_special_tokens=False)

                # Generate output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
                self.assertEqual(output_p, output_r)

                # Generate pair output
                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
                self.assertEqual(output_p, output_r)
    def test_padding(self, max_length=50):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)

                def assert_padded_input_match(input_r: list, input_p: list, max_length: int):
                    # Ensure we match max_length
                    self.assertEqual(len(input_r), max_length)
                    self.assertEqual(len(input_p), max_length)

                    # Ensure the number of padded tokens is the same
                    padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r)))
                    padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p)))
                    self.assertSequenceEqual(padded_tokens_r, padded_tokens_p)

                def assert_batch_padded_input_match(input_r: dict, input_p: dict, max_length: int):
                    for i_r in input_r.values():
                        self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(
                            len(i_r[1]), max_length
                        )
                        self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), max_length), self.assertEqual(
                            len(i_r[1]), max_length
                        )

                    for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]):
                        assert_padded_input_match(i_r, i_p, max_length)

                    for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]):
                        self.assertSequenceEqual(i_r, i_p)

                # Encode - Simple input
                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True)
                assert_padded_input_match(input_r, input_p, max_length)
                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
                assert_padded_input_match(input_r, input_p, max_length)

                input_r = tokenizer_r.encode("This is a simple input", padding="longest")
                input_p = tokenizer_p.encode("This is a simple input", padding=True)
                assert_padded_input_match(input_r, input_p, len(input_r))

                # Encode - Pair input
                input_r = tokenizer_r.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                assert_padded_input_match(input_r, input_p, max_length)
                input_r = tokenizer_r.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                assert_padded_input_match(input_r, input_p, max_length)
                input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True)
                input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest")
                assert_padded_input_match(input_r, input_p, len(input_r))

                # Encode_plus - Simple input
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", max_length=max_length, pad_to_max_length=True
                )
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", max_length=max_length, padding="max_length"
                )
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest")
                input_p = tokenizer_p.encode_plus("This is a simple input", padding=True)
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))

                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Encode_plus - Pair input
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True
                )
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                input_p = tokenizer_p.encode_plus(
                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
                )
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
                input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest")
                input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True)
                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))
                self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])

                # Batch_encode_plus - Simple input
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    pad_to_max_length=True,
                )
                assert_batch_padded_input_match(input_r, input_p, max_length)

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="max_length",
                )
                assert_batch_padded_input_match(input_r, input_p, max_length)

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding="longest",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"],
                    max_length=max_length,
                    padding=True,
                )
                assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"], padding="longest"
                )
                input_p = tokenizer_p.batch_encode_plus(
                    ["This is a simple input 1", "This is a simple input 2"], padding=True
                )
                assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

                # Batch_encode_plus - Pair input
                input_r = tokenizer_r.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                input_p = tokenizer_p.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    max_length=max_length,
                    truncation=True,
                    padding="max_length",
                )
                assert_batch_padded_input_match(input_r, input_p, max_length)

                input_r = tokenizer_r.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    padding=True,
                )
                input_p = tokenizer_p.batch_encode_plus(
                    [
                        ("This is a simple input 1", "This is a simple input 2"),
                        ("This is a simple pair 1", "This is a simple pair 2"),
                    ],
                    padding="longest",
                )
                assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus("This is a input 1")
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.encode_plus("This is a input 1")
                input_p = tokenizer_r.pad(input_p)

                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]))

                # Using pad on single examples after tokenization
                input_r = tokenizer_r.encode_plus("This is a input 1")
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.encode_plus("This is a input 1")
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length)

                # Using pad after tokenization
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a input 1", "This is a much longer input whilch should be padded"]
                )
                input_r = tokenizer_r.pad(input_r)

                input_p = tokenizer_r.batch_encode_plus(
                    ["This is a input 1", "This is a much longer input whilch should be padded"]
                )
                input_p = tokenizer_r.pad(input_p)

                assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]))

                # Using pad after tokenization
                input_r = tokenizer_r.batch_encode_plus(
                    ["This is a input 1", "This is a much longer input whilch should be padded"]
                )
                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")

                input_p = tokenizer_r.batch_encode_plus(
                    ["This is a input 1", "This is a much longer input whilch should be padded"]
                )
                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")

                assert_batch_padded_input_match(input_r, input_p, max_length)
def test_save_pretrained(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
# Checks it save with the same files
self.assertSequenceEqual(
tokenizer_r.save_vocabulary(self.tmpdirname), tokenizer_p.save_vocabulary(self.tmpdirname)
)
# Checks everything loads correctly in the same way
tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained(self.tmpdirname), tokenizer_p.from_pretrained(
self.tmpdirname
)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
def test_embeded_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
sentence = "A, <mask> AllenNLP sentence."
tokens_r = tokenizer_r.encode_plus(
sentence,
add_special_tokens=True,
)
tokens_p = tokenizer_p.encode_plus(
sentence,
add_special_tokens=True,
)
for key in tokens_p.keys():
self.assertEqual(tokens_r[key], tokens_p[key])
if "token_type_ids" in tokens_r:
self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
self.assertSequenceEqual(tokens_r, tokens_p)
def test_add_special_tokens(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)
# pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True)
for text in ["", " "]:
# tokenize()
no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode()
no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True)
self.assertEqual(
len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
)
# encode_plus()
no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False)
with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True)
for key in no_special_tokens.keys():
self.assertEqual(
len(no_special_tokens[key]),
len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
)
# # batch_encode_plus
no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False)
with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True)
for key in no_special_tokens.keys():
for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)
def test_prepare_for_model(self):
for tok_case, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
string_sequence = "Asserting that both tokenizers are equal"
python_output = tokenizer_p.prepare_for_model(
tokenizer_p.encode(string_sequence, add_special_tokens=False)
)
rust_output = tokenizer_r.prepare_for_model(
tokenizer_r.encode(string_sequence, add_special_tokens=False)
)
for key in python_output:
self.assertEqual(python_output[key], rust_output[key])
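# ----------------------------------------------------------------------
# Illustrative sketch (not part of this diff): every test above follows the
# same slow-vs-fast parity pattern. A minimal standalone version of that
# check, assuming network access to the "bert-base-cased" checkpoint, could
# look like this:
#
#     from transformers import AutoTokenizer
#
#     slow = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
#     fast = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
#
#     text = "A, naïve AllenNLP sentence."
#     assert slow.tokenize(text) == fast.tokenize(text)
#     assert slow.encode(text) == fast.encode(text)
# ----------------------------------------------------------------------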
class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
@@ -733,61 +856,86 @@ class WordPieceFastTokenizerTest(CommonFastTokenizerTest):
            Tokenizer(
                "DistilBert", DistilBertTokenizerFast, DistilBertTokenizer, "vocab_file", filter_non_english, None
            ),
Tokenizer(
"DPRReaderTokenizer",
DPRReaderTokenizerFast,
DPRReaderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer(
"DPRQuestionEncoderTokenizer",
DPRQuestionEncoderTokenizerFast,
DPRQuestionEncoderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer(
"DPRContextEncoderTokenizer",
DPRContextEncoderTokenizerFast,
DPRContextEncoderTokenizer,
"vocab_file",
filter_non_english,
None,
),
Tokenizer("FunnelTokenizer", FunnelTokenizerFast, FunnelTokenizer, "vocab_file", filter_non_english, None),
Tokenizer("LxmertTokenizer", LxmertTokenizerFast, LxmertTokenizer, "vocab_file", filter_non_english, None),
        ]
    )
    def test_offsets_with_special_characters(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)

                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                tokens = tokenizer_r.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )

                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
                expected_results = (
                    [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "A"),
                        ((1, 2), ","),
                        ((3, 5), "na"),
                        ((5, 6), "##ï"),
                        ((6, 8), "##ve"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "Allen"),
                        ((21, 23), "##NL"),
                        ((23, 24), "##P"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                    if not do_lower_case
                    else [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "a"),
                        ((1, 2), ","),
                        ((3, 8), "naive"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "allen"),
                        ((21, 23), "##nl"),
                        ((23, 24), "##p"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                )

                self.assertEqual(
                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
                )
                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
class RobertaFastTokenizerTest(CommonFastTokenizerTest):
@@ -800,32 +948,52 @@ class RobertaFastTokenizerTest(CommonFastTokenizerTest):
                "vocab_file",
                filter_roberta_detectors,
                (("cls_token", "<s>"),),
            ),
Tokenizer(
"Bart",
BartTokenizerFast,
BartTokenizer,
"vocab_file",
None,
None,
),
        ]
    )
    def test_pretokenized_inputs(self):
        pass

    def test_embeded_special_tokens(self):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)
                tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name, **kwargs)
                sentence = "A, <mask> AllenNLP sentence."
                tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
                tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)

                # token_type_ids should put 0 everywhere
                self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))

                # attention_mask should put 1 everywhere, so sum over length should be 1
                self.assertEqual(
                    sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
                    sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
                )

                tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
                tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])

                # Rust correctly handles the space before the mask while python doesnt
                self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
                self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])

                self.assertSequenceEqual(
                    tokens_p_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                )
                self.assertSequenceEqual(
                    tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                )
class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
@@ -834,62 +1002,75 @@ class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest):
        Tokenizer("GPT2", GPT2TokenizerFast, GPT2Tokenizer, "vocab_file", None, [("add_prefix_space", True)]),
    ]
    def test_pretokenized_inputs(self):
        pass

    def test_padding(self, max_length=15):
        for tok_case, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tok_case.name, pretrained_name)):
                tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name, **kwargs)

                # Simple input
                s = "This is a simple input"
                s2 = ["This is a simple input 1", "This is a simple input 2"]
                p = ("This is a simple input", "This is a pair")
                p2 = [
                    ("This is a simple input 1", "This is a simple input 2"),
                    ("This is a simple pair 1", "This is a simple pair 2"),
                ]

                # Simple input tests
                self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")

                # Simple input
                self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")

                # Simple input
                self.assertRaises(
                    ValueError,
                    tokenizer_r.batch_encode_plus,
                    s2,
                    max_length=max_length,
                    padding="max_length",
                )

                # Pair input
                self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")

                # Pair input
                self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")

                # Pair input
                self.assertRaises(
                    ValueError,
                    tokenizer_r.batch_encode_plus,
                    p2,
                    max_length=max_length,
                    padding="max_length",
                )


class SentencePieceFastTokenizerTest(CommonFastTokenizerTest):
    """
    Override specific methods to test SentencePiece behavior
    """

    TOKENIZERS_CLASSES = frozenset(
        [
            Tokenizer("Albert", AlbertTokenizerFast, AlbertTokenizer, "vocab_file", None, None),
            Tokenizer("Camembert", CamembertTokenizerFast, CamembertTokenizer, "vocab_file", None, None),
            Tokenizer("T5", T5TokenizerFast, T5Tokenizer, "vocab_file", None, None),
            Tokenizer(
                "MBart",
                MBartTokenizerFast,
                MBartTokenizer,
                "vocab_file",
                None,
None,
),
Tokenizer("Pegasus", PegasusTokenizerFast, PegasusTokenizer, "vocab_file", None, None),
Tokenizer("Reformer", ReformerTokenizerFast, ReformerTokenizer, "vocab_file", None, None),
Tokenizer("XLMRoberta", XLMRobertaTokenizerFast, XLMRobertaTokenizer, "vocab_file", None, None),
Tokenizer("XLNet", XLNetTokenizerFast, XLNetTokenizer, "vocab_file", None, None),
]
)
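# ----------------------------------------------------------------------
# Illustrative sketch (not part of this diff): the SentencePiece-based fast
# tokenizers registered above can be compared against their slow counterparts
# directly. Assuming network access to the "t5-small" checkpoint:
#
#     from transformers import T5Tokenizer, T5TokenizerFast
#
#     slow = T5Tokenizer.from_pretrained("t5-small")
#     fast = T5TokenizerFast.from_pretrained("t5-small")
#
#     sequence = "I was born in 92000, and this is falsé."
#     assert slow.tokenize(sequence) == fast.tokenize(sequence)
#     assert slow.encode(sequence, add_special_tokens=False) == fast.encode(sequence, add_special_tokens=False)
# ----------------------------------------------------------------------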
@@ -26,6 +26,7 @@ class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = FunnelTokenizer
    test_rust_tokenizer = True
    space_between_special_tokens = True

    def setUp(self):
        super().setUp()
...
@@ -26,6 +26,7 @@ from .test_tokenization_common import TokenizerTesterMixin
class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = GPT2Tokenizer
    rust_tokenizer_class = GPT2TokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
...
@@ -18,7 +18,7 @@ import os
import unittest

from transformers.tokenization_bert import VOCAB_FILES_NAMES
from transformers.tokenization_lxmert import LxmertTokenizer, LxmertTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

@@ -26,6 +26,9 @@ from .test_tokenization_common import TokenizerTesterMixin
class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = LxmertTokenizer
    rust_tokenizer_class = LxmertTokenizerFast
    test_rust_tokenizer = True
    space_between_special_tokens = True

    def setUp(self):
        super().setUp()

@@ -49,9 +52,6 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_input_output_texts(self, tokenizer):
        input_text = "UNwant\u00E9d,running"
        output_text = "unwanted, running"

@@ -63,3 +63,25 @@ class LxmertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
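# ----------------------------------------------------------------------
# Illustrative sketch (not part of this diff): assuming the repository's usual
# tests/ layout, the new slow-vs-fast comparison for a single model can be run
# in isolation with pytest, e.g.:
#
#     pytest tests/test_tokenization_lxmert.py -k "rust_and_python_full_tokenizers" -v
# ----------------------------------------------------------------------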
@@ -38,6 +38,7 @@ FRAMEWORK = "pt" if _torch_available else "tf"
class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = MarianTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
...
import tempfile
import unittest

from transformers import AutoTokenizer, BatchEncoding, MBartTokenizer, MBartTokenizerFast, is_torch_available
from transformers.testing_utils import require_torch

from .test_tokenization_common import TokenizerTesterMixin

@@ -17,6 +17,8 @@ RO_CODE = 250020
class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = MBartTokenizer
    rust_tokenizer_class = MBartTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
...
@@ -18,7 +18,7 @@ import json
import os
import unittest

from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer, OpenAIGPTTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = OpenAIGPTTokenizer
    rust_tokenizer_class = OpenAIGPTTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
...
@@ -3,7 +3,7 @@ from pathlib import Path
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch
from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

@@ -11,6 +11,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = PegasusTokenizer
    rust_tokenizer_class = PegasusTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
...
@@ -19,7 +19,7 @@ import unittest
from transformers.file_utils import cached_property
from transformers.testing_utils import require_torch, slow
from transformers.tokenization_reformer import SPIECE_UNDERLINE, ReformerTokenizer, ReformerTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

@@ -30,6 +30,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = ReformerTokenizer
    rust_tokenizer_class = ReformerTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()

@@ -37,6 +39,28 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
    def test_full_tokenizer(self):
        tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
...
@@ -26,6 +26,8 @@ from .test_tokenization_common import TokenizerTesterMixin
class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = RobertaTokenizer
    rust_tokenizer_class = RobertaTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
...
@@ -20,13 +20,12 @@ import unittest
from transformers import BatchEncoding
from transformers.file_utils import cached_property
from transformers.testing_utils import _torch_available
from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast

from .test_tokenization_common import TokenizerTesterMixin

SPIECE_UNDERLINE = "▁"

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
FRAMEWORK = "pt" if _torch_available else "tf"

@@ -35,6 +34,8 @@ FRAMEWORK = "pt" if _torch_available else "tf"
class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = T5Tokenizer
    rust_tokenizer_class = T5TokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()

@@ -113,6 +114,38 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def t5_base_tokenizer(self):
        return T5Tokenizer.from_pretrained("t5-base")
@cached_property
def t5_base_tokenizer_fast(self):
return T5TokenizerFast.from_pretrained("t5-base")
def get_tokenizer(self, **kwargs) -> T5Tokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
def get_rust_tokenizer(self, **kwargs) -> T5TokenizerFast:
return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, pad_token=None, **kwargs)
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
sequence = "I was born in 92000, and this is falsé."
tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)
ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)
rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)
    def test_eos_treatment(self):
        tokenizer = self.t5_base_tokenizer
        batch_with_eos_added = tokenizer(["hi</s>", "I went to the gym</s>", "</s>"])
...
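The new `t5_base_tokenizer_fast` property and the `get_rust_tokenizer` override above wire the fast T5 tokenizer into the shared tester mixin. As a usage sketch only, assuming the public `t5-base` checkpoint and that the parity asserted by the tests also holds for it:

from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast

# The slow tokenizer wraps the SentencePiece model directly; the fast one is the
# converted, `tokenizers`-backed version exercised by the new tests.
slow = T5Tokenizer.from_pretrained("t5-base")
fast = T5TokenizerFast.from_pretrained("t5-base")

text = "I went to the gym"
assert slow.tokenize(text) == fast.tokenize(text)
assert slow.encode(text) == fast.encode(text)

# Both accept batches and return a BatchEncoding, as in test_eos_treatment above.
batch = fast(["hi</s>", "I went to the gym</s>", "</s>"], padding=True)
print(batch["input_ids"])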
@@ -17,20 +17,15 @@
import os
import unittest

from transformers.tokenization_transfo_xl import VOCAB_FILES_NAMES, TransfoXLTokenizer

from .test_tokenization_common import TokenizerTesterMixin


class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = TransfoXLTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
...
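The hunk above covers the breaking part of this PR: the fast Transfo-XL tokenizer is removed, so the test class sets `test_rust_tokenizer = False` and the imports no longer need the torch guard. A minimal sketch of what remains available, with the `transfo-xl-wt103` checkpoint name assumed for illustration:

from transformers.tokenization_transfo_xl import TransfoXLTokenizer

# Only the pure-Python (slow) implementation exists for Transfo-XL after this change;
# there is no fast counterpart to fall back to.
tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")  # checkpoint name assumed
tokens = tokenizer.tokenize("Hello , world !")
print(tokens, tokenizer.convert_tokens_to_ids(tokens))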
@@ -27,6 +27,7 @@ from .test_tokenization_common import TokenizerTesterMixin

class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLMTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
...
@@ -19,7 +19,7 @@ import unittest

from transformers.file_utils import cached_property
from transformers.testing_utils import slow
from transformers.tokenization_xlm_roberta import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin
@@ -30,6 +30,8 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture

class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLMRobertaTokenizer
    rust_tokenizer_class = XLMRobertaTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()
@@ -118,6 +120,28 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    def big_tokenizer(self):
        return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "I was born in 92000, and this is falsé."

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

    @slow
    def test_tokenization_base_easy_symbols(self):
        symbols = "Hello World!"
...
@@ -18,7 +18,7 @@ import os
import unittest

from transformers.testing_utils import slow
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer, XLNetTokenizerFast

from .test_tokenization_common import TokenizerTesterMixin
@@ -29,12 +29,15 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture

class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLNetTokenizer
    rust_tokenizer_class = XLNetTokenizerFast
    test_rust_tokenizer = True

    def setUp(self):
        super().setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.sanitize_special_tokens()
        tokenizer.save_pretrained(self.tmpdirname)

    def test_full_tokenizer(self):
...