Commit 00204f2b authored by Aymeric Augustin

Replace CommonTestCases for tokenizers with a mixin.

This is the same change as for (TF)CommonTestCases for modeling.
parent a3c5883f
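The change applies the standard unittest mixin pattern: the shared tokenizer checks move into a plain class that is not itself a TestCase, and every concrete tokenizer test class now inherits from both TokenizerTesterMixin and unittest.TestCase (which is why each test module below gains an import of unittest). The following is a minimal, self-contained sketch of the pattern only — a simplified stand-in for the real TokenizerTesterMixin defined in test_tokenization_common.py, with made-up Dummy* names used purely for illustration:

import unittest


class TokenizerTesterMixin:
    # Shared checks live here. Because this class is not a TestCase,
    # the test runner never collects these methods on their own.
    tokenizer_class = None

    def get_tokenizer(self, **kwargs):
        raise NotImplementedError

    def test_has_a_tokenizer_class(self):
        # Runs once per concrete class that mixes this in.
        self.assertIsNotNone(self.tokenizer_class)


class DummyTokenizer:
    """Stand-in tokenizer used only for this illustration."""


class DummyTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = DummyTokenizer

    def get_tokenizer(self, **kwargs):
        return self.tokenizer_class()


if __name__ == "__main__":
    unittest.main()

Compared with the previous nested CommonTestCases.CommonTokenizerTester arrangement, the mixin removes one level of class nesting and lets the shared module drop its own unittest import, as the diffs below show.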
@@ -15,14 +15,15 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
+import unittest
from io import open

from transformers.tokenization_bert import VOCAB_FILES_NAMES, XxxTokenizer

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin


-class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XxxTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = XxxTokenizer
...
@@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function
import json
import os

-from .test_tokenization_commons import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory


class ConfigTester(object):
...
@@ -20,7 +20,7 @@ import unittest
from transformers.modelcard import ModelCard

-from .test_tokenization_commons import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory


class ModelCardTester(unittest.TestCase):
...
@@ -19,7 +19,7 @@ import unittest
from transformers import is_torch_available

-from .test_tokenization_commons import TemporaryDirectory
+from .test_tokenization_common import TemporaryDirectory
from .utils import require_torch
...
@@ -15,16 +15,17 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
+import unittest

from transformers.tokenization_albert import AlbertTokenizer

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/spiece.model")


-class AlbertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = AlbertTokenizer
...
@@ -15,6 +15,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
+import unittest
from io import open

from transformers.tokenization_bert import (
@@ -27,11 +28,11 @@ from transformers.tokenization_bert import (
    _is_whitespace,
)

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow


-class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = BertTokenizer
...
@@ -15,6 +15,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
+import unittest
from io import open

from transformers.tokenization_bert import WordpieceTokenizer
@@ -25,12 +26,12 @@ from transformers.tokenization_bert_japanese import (
    MecabTokenizer,
)

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import custom_tokenizers, slow


@custom_tokenizers
-class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = BertJapaneseTokenizer
@@ -130,7 +131,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
        assert encoded_pair == [2] + text + [3] + text_2 + [3]


-class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = BertJapaneseTokenizer
...
@@ -18,7 +18,6 @@ import os
import shutil
import sys
import tempfile
-import unittest
from io import open
@@ -43,489 +42,479 @@ else:
    unicode = str


-class CommonTestCases:
-    class CommonTokenizerTester(unittest.TestCase):
+class TokenizerTesterMixin:

    tokenizer_class = None

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def get_tokenizer(self, **kwargs):
        raise NotImplementedError

    def get_input_output_texts(self):
        raise NotImplementedError

    def test_tokenizers_common_properties(self):
        tokenizer = self.get_tokenizer()
        attributes_list = [
            "bos_token",
            "eos_token",
            "unk_token",
            "sep_token",
            "pad_token",
            "cls_token",
            "mask_token",
        ]
        for attr in attributes_list:
            self.assertTrue(hasattr(tokenizer, attr))
            self.assertTrue(hasattr(tokenizer, attr + "_id"))

        self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
        self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))

        attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
        for attr in attributes_list:
            self.assertTrue(hasattr(tokenizer, attr))

    def test_save_and_load_tokenizer(self):
        # safety check on max_len default value so we are sure the test works
        tokenizer = self.get_tokenizer()
        self.assertNotEqual(tokenizer.max_len, 42)

        # Now let's start the test
        tokenizer = self.get_tokenizer(max_len=42)
        before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)

        with TemporaryDirectory() as tmpdirname:
            tokenizer.save_pretrained(tmpdirname)
            tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)

            after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
            self.assertListEqual(before_tokens, after_tokens)

            self.assertEqual(tokenizer.max_len, 42)
            tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
            self.assertEqual(tokenizer.max_len, 43)

    def test_pickle_tokenizer(self):
        tokenizer = self.get_tokenizer()
        self.assertIsNotNone(tokenizer)

        text = "Munich and Berlin are nice cities"
        subwords = tokenizer.tokenize(text)

        with TemporaryDirectory() as tmpdirname:
            filename = os.path.join(tmpdirname, "tokenizer.bin")
            with open(filename, "wb") as handle:
                pickle.dump(tokenizer, handle)

            with open(filename, "rb") as handle:
                tokenizer_new = pickle.load(handle)

        subwords_loaded = tokenizer_new.tokenize(text)

        self.assertListEqual(subwords, subwords_loaded)

    def test_added_tokens_do_lower_case(self):
        tokenizer = self.get_tokenizer(do_lower_case=True)

        special_token = tokenizer.all_special_tokens[0]

        text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
        text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

        toks0 = tokenizer.tokenize(text)  # toks before adding new_toks

        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
        added = tokenizer.add_tokens(new_toks)
        self.assertEqual(added, 2)

        toks = tokenizer.tokenize(text)
        toks2 = tokenizer.tokenize(text2)

        self.assertEqual(len(toks), len(toks2))
        self.assertNotEqual(len(toks), len(toks0))  # toks0 should be longer
        self.assertListEqual(toks, toks2)

        # Check that none of the special tokens are lowercased
        sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
        tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens)

        for special_token in tokenizer.all_special_tokens:
            self.assertTrue(special_token in tokenized_sequence)

        tokenizer = self.get_tokenizer(do_lower_case=False)

        added = tokenizer.add_tokens(new_toks)
        self.assertEqual(added, 4)

        toks = tokenizer.tokenize(text)
        toks2 = tokenizer.tokenize(text2)

        self.assertEqual(len(toks), len(toks2))  # Length should still be the same
        self.assertNotEqual(len(toks), len(toks0))
        self.assertNotEqual(toks[1], toks2[1])  # But at least the first non-special tokens should differ

    def test_add_tokens_tokenizer(self):
        tokenizer = self.get_tokenizer()

        vocab_size = tokenizer.vocab_size
        all_size = len(tokenizer)

        self.assertNotEqual(vocab_size, 0)
        self.assertEqual(vocab_size, all_size)

        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
        added_toks = tokenizer.add_tokens(new_toks)
        vocab_size_2 = tokenizer.vocab_size
        all_size_2 = len(tokenizer)

        self.assertNotEqual(vocab_size_2, 0)
        self.assertEqual(vocab_size, vocab_size_2)
        self.assertEqual(added_toks, len(new_toks))
        self.assertEqual(all_size_2, all_size + len(new_toks))

        tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
        out_string = tokenizer.decode(tokens)

        self.assertGreaterEqual(len(tokens), 4)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

        new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
        added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
        vocab_size_3 = tokenizer.vocab_size
        all_size_3 = len(tokenizer)

        self.assertNotEqual(vocab_size_3, 0)
        self.assertEqual(vocab_size, vocab_size_3)
        self.assertEqual(added_toks_2, len(new_toks_2))
        self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

        tokens = tokenizer.encode(
            ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
        )
        out_string = tokenizer.decode(tokens)

        self.assertGreaterEqual(len(tokens), 6)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[0], tokens[1])
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokens[-3])
        self.assertEqual(tokens[0], tokenizer.eos_token_id)
        self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    def test_add_special_tokens(self):
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()

        special_token = "[SPECIAL TOKEN]"

        tokenizer.add_special_tokens({"cls_token": special_token})
        encoded_special_token = tokenizer.encode(special_token, add_special_tokens=False)
        assert len(encoded_special_token) == 1

        text = " ".join([input_text, special_token, output_text])
        encoded = tokenizer.encode(text, add_special_tokens=False)

        input_encoded = tokenizer.encode(input_text, add_special_tokens=False)
        output_encoded = tokenizer.encode(output_text, add_special_tokens=False)
        special_token_id = tokenizer.encode(special_token, add_special_tokens=False)
        assert encoded == input_encoded + special_token_id + output_encoded

        decoded = tokenizer.decode(encoded, skip_special_tokens=True)
        assert special_token not in decoded

    def test_required_methods_tokenizer(self):
        tokenizer = self.get_tokenizer()
        input_text, output_text = self.get_input_output_texts()

        tokens = tokenizer.tokenize(input_text)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
        self.assertListEqual(ids, ids_2)

        tokens_2 = tokenizer.convert_ids_to_tokens(ids)
        text_2 = tokenizer.decode(ids)

        self.assertEqual(text_2, output_text)

        self.assertNotEqual(len(tokens_2), 0)
        self.assertIsInstance(text_2, (str, unicode))

    def test_encode_decode_with_spaces(self):
        tokenizer = self.get_tokenizer()

        new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
        tokenizer.add_tokens(new_toks)
        input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
        encoded = tokenizer.encode(input, add_special_tokens=False)
        decoded = tokenizer.decode(encoded)
        self.assertEqual(decoded, input)

    def test_pretrained_model_lists(self):
        weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
        weights_lists_2 = []
        for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
            weights_lists_2.append(list(map_list.keys()))

        for weights_list_2 in weights_lists_2:
            self.assertListEqual(weights_list, weights_list_2)

    def test_mask_output(self):
        if sys.version_info <= (3, 0):
            return

        tokenizer = self.get_tokenizer()

-        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
        seq_0 = "Test this method."
        seq_1 = "With these inputs."
        information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
        sequences, mask = information["input_ids"], information["token_type_ids"]
        self.assertEqual(len(sequences), len(mask))

    def test_number_of_added_tokens(self):
        tokenizer = self.get_tokenizer()

-        if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
        seq_0 = "Test this method."
        seq_1 = "With these inputs."

        sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
        attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

        # Method is implemented (e.g. not GPT-2)
        if len(attached_sequences) != 2:
            self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences))

    def test_maximum_encoding_length_single_input(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "This is a sentence to be encoded."
        stride = 2

        sequence = tokenizer.encode(seq_0, add_special_tokens=False)
        num_added_tokens = tokenizer.num_added_tokens()
        total_length = len(sequence) + num_added_tokens
        information = tokenizer.encode_plus(
            seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride, return_overflowing_tokens=True,
        )

        truncated_sequence = information["input_ids"]
        overflowing_tokens = information["overflowing_tokens"]

        self.assertEqual(len(overflowing_tokens), 2 + stride)
        self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
        self.assertEqual(len(truncated_sequence), total_length - 2)
        self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))

    def test_maximum_encoding_length_pair_input(self):
        tokenizer = self.get_tokenizer()

        seq_0 = "This is a sentence to be encoded."
        seq_1 = "This is another sentence to be encoded."
        stride = 2

        sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
        sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

        sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
        truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
            tokenizer.encode(seq_0, add_special_tokens=False), tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
        )

        information = tokenizer.encode_plus(
            seq_0,
            seq_1,
            max_length=len(sequence) - 2,
            add_special_tokens=True,
            stride=stride,
            truncation_strategy="only_second",
            return_overflowing_tokens=True,
        )
        information_first_truncated = tokenizer.encode_plus(
            seq_0,
            seq_1,
            max_length=len(sequence) - 2,
            add_special_tokens=True,
            stride=stride,
            truncation_strategy="only_first",
            return_overflowing_tokens=True,
        )

        truncated_sequence = information["input_ids"]
        overflowing_tokens = information["overflowing_tokens"]
        overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]

        self.assertEqual(len(overflowing_tokens), 2 + stride)
        self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
        self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])
        self.assertEqual(len(truncated_sequence), len(sequence) - 2)
        self.assertEqual(truncated_sequence, truncated_second_sequence)

    def test_encode_input_type(self):
        tokenizer = self.get_tokenizer()

        sequence = "Let's encode this sequence"

        tokens = tokenizer.tokenize(sequence)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        formatted_input = tokenizer.encode(sequence, add_special_tokens=True)

        self.assertEqual(tokenizer.encode(tokens, add_special_tokens=True), formatted_input)
        self.assertEqual(tokenizer.encode(input_ids, add_special_tokens=True), formatted_input)

    def test_special_tokens_mask(self):
        tokenizer = self.get_tokenizer()

        sequence_0 = "Encode this."
        sequence_1 = "This one too please."

        # Testing single inputs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

        filtered_sequence = [
            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
        ]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing inputs pairs
        encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
            sequence_1, add_special_tokens=False
        )
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))

        filtered_sequence = [
            (x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
        ]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        self.assertEqual(encoded_sequence, filtered_sequence)

        # Testing with already existing special tokens
        if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
            tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
        encoded_sequence_dict = tokenizer.encode_plus(
            sequence_0, add_special_tokens=True, return_special_tokens_mask=True
        )
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
        special_tokens_mask = tokenizer.get_special_tokens_mask(
            encoded_sequence_w_special, already_has_special_tokens=True
        )
        self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
        self.assertEqual(special_tokens_mask_orig, special_tokens_mask)

    def test_padding_to_max_length(self):
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id

        # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
        tokenizer.padding_side = "right"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        assert sequence_length + padding_size == padded_sequence_length
        assert encoded_sequence + [padding_idx] * padding_size == padded_sequence

        # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True
        tokenizer.padding_side = "left"
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)
        padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
        padded_sequence_length = len(padded_sequence)
        assert sequence_length + padding_size == padded_sequence_length
        assert [padding_idx] * padding_size + encoded_sequence == padded_sequence

        # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified
        encoded_sequence = tokenizer.encode(sequence)
        sequence_length = len(encoded_sequence)

        tokenizer.padding_side = "right"
        padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_right_length = len(padded_sequence_right)

        tokenizer.padding_side = "left"
        padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True)
        padded_sequence_left_length = len(padded_sequence_left)

        assert sequence_length == padded_sequence_right_length
        assert encoded_sequence == padded_sequence_right
        assert sequence_length == padded_sequence_left_length
        assert encoded_sequence == padded_sequence_left

    def test_encode_plus_with_padding(self):
        tokenizer = self.get_tokenizer()

        sequence = "Sequence"
        padding_size = 10
        padding_idx = tokenizer.pad_token_id
        token_type_padding_idx = tokenizer.pad_token_type_id

        encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
        input_ids = encoded_sequence["input_ids"]
        token_type_ids = encoded_sequence["token_type_ids"]
        attention_mask = encoded_sequence["attention_mask"]
        special_tokens_mask = encoded_sequence["special_tokens_mask"]
        sequence_length = len(input_ids)

        # Test right padding
        tokenizer.padding_side = "right"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        assert sequence_length + padding_size == padded_sequence_length
        assert input_ids + [padding_idx] * padding_size == padded_input_ids
        assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids
        assert attention_mask + [0] * padding_size == padded_attention_mask
        assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask

        # Test left padding
        tokenizer.padding_side = "left"
        padded_sequence = tokenizer.encode_plus(
            sequence,
            max_length=sequence_length + padding_size,
            pad_to_max_length=True,
            return_special_tokens_mask=True,
        )
        padded_input_ids = padded_sequence["input_ids"]
        padded_token_type_ids = padded_sequence["token_type_ids"]
        padded_attention_mask = padded_sequence["attention_mask"]
        padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
        padded_sequence_length = len(padded_input_ids)

        assert sequence_length + padding_size == padded_sequence_length
        assert [padding_idx] * padding_size + input_ids == padded_input_ids
        assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids
        assert [0] * padding_size + attention_mask == padded_attention_mask
        assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask
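For reference, a concrete test class is expected to provide the two hooks that the mixin leaves raising NotImplementedError, plus the tokenizer_class attribute. The sketch below only illustrates the general shape; the class name, the toy vocabulary and the example texts are invented for this illustration and are not taken from the repository, and the real per-model tests configure their tokenizers differently:

import os
import unittest

from transformers.tokenization_bert import VOCAB_FILES_NAMES, BertTokenizer

from .test_tokenization_common import TokenizerTesterMixin


class ExampleBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = BertTokenizer

    def setUp(self):
        super().setUp()  # the mixin's setUp creates self.tmpdirname
        # Write a tiny vocabulary file so the tokenizer can be built offline.
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "want", "##want", "##ed", "low", "running", ","]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("\n".join(vocab_tokens))

    def get_tokenizer(self, **kwargs):
        return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self):
        return "UNwanted, running", "unwanted, running"

Because the mixin's setUp creates self.tmpdirname and tearDown removes it, a subclass that overrides setUp should call super().setUp() first, as above.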
@@ -15,14 +15,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import json
import os
+import unittest
from io import open

from transformers.tokenization_ctrl import VOCAB_FILES_NAMES, CTRLTokenizer

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin


-class CTRLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = CTRLTokenizer
...
@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import json
import os
+import unittest
from io import open

from transformers.tokenization_gpt2 import VOCAB_FILES_NAMES, GPT2Tokenizer

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin


-class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
+class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = GPT2Tokenizer
...
@@ -16,13 +16,14 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import json
import os
+import unittest

from transformers.tokenization_openai import VOCAB_FILES_NAMES, OpenAIGPTTokenizer

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin


-class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = OpenAIGPTTokenizer
...
@@ -16,15 +16,16 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import json
import os
+import unittest
from io import open

from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow


-class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = RobertaTokenizer

    def setUp(self):
...
@@ -15,17 +15,18 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
+import unittest

from transformers.tokenization_t5 import T5Tokenizer
from transformers.tokenization_xlnet import SPIECE_UNDERLINE

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")


-class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
+class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = T5Tokenizer
...
@@ -15,11 +15,12 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
+import unittest
from io import open

from transformers import is_torch_available

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import require_torch
@@ -28,7 +29,7 @@ if is_torch_available():

@require_torch
-class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = TransfoXLTokenizer if is_torch_available() else None
...
@@ -16,14 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals
import json
import os
+import unittest

from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow


-class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = XLMTokenizer
...
@@ -15,17 +15,18 @@
from __future__ import absolute_import, division, print_function, unicode_literals

import os
+import unittest

from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer

-from .test_tokenization_commons import CommonTestCases
+from .test_tokenization_common import TokenizerTesterMixin
from .utils import slow

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")


-class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
+class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = XLNetTokenizer
...