Unverified Commit 54abc67a authored by Thomas Wolf, committed by GitHub

Merge pull request #2255 from aaugustin/implement-best-practices

Implement some Python best practices
parents 645713e2 c11b3e29
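
The diff that follows is a mechanical style cleanup rather than a behavioral change: string literals are normalized to double quotes, long calls and literals are re-wrapped one argument or item per line with a trailing comma (black-compatible layout), imports are regrouped and sorted (isort-style), and redundant Python 2 artifacts such as u-prefixed string literals are dropped. As a quick orientation before the diff, here is a minimal sketch of the before/after shape of these changes; it is illustrative only, not part of the commit, and the function name and argument values are made up.

# Illustrative sketch only -- not taken from the repository.
# Before the cleanup, a call like this was typically written with single quotes
# and hand-aligned continuation lines:
#
#     encoded = tokenizer.encode_plus(text, max_length=128,
#                                     pad_to_max_length=True,
#                                     return_special_tokens_mask=True)


def encode_example(tokenizer, text):
    """After the cleanup: double quotes, one argument per line, trailing comma."""
    return tokenizer.encode_plus(
        text,
        max_length=128,
        pad_to_max_length=True,
        return_special_tokens_mask=True,
    )
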
@@ -15,30 +15,35 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import shutil
import sys
from io import open
import tempfile
import shutil
import unittest
from io import open
if sys.version_info[0] == 2:
import cPickle as pickle
class TemporaryDirectory(object):
"""Context manager for tempfile.mkdtemp() so it's usable with "with" statement."""
def __enter__(self):
self.name = tempfile.mkdtemp()
return self.name
def __exit__(self, exc_type, exc_value, traceback):
shutil.rmtree(self.name)
else:
import pickle
TemporaryDirectory = tempfile.TemporaryDirectory
unicode = str
class CommonTestCases:
class CommonTokenizerTester(unittest.TestCase):
tokenizer_class = None
@@ -57,17 +62,23 @@ class CommonTestCases:
def test_tokenizers_common_properties(self):
tokenizer = self.get_tokenizer()
attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token",
"pad_token", "cls_token", "mask_token"]
attributes_list = [
"bos_token",
"eos_token",
"unk_token",
"sep_token",
"pad_token",
"cls_token",
"mask_token",
]
for attr in attributes_list:
self.assertTrue(hasattr(tokenizer, attr))
self.assertTrue(hasattr(tokenizer, attr + "_id"))
self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids'))
self.assertTrue(hasattr(tokenizer, "additional_special_tokens_ids"))
attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder",
"added_tokens_decoder"]
attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder", "added_tokens_decoder"]
for attr in attributes_list:
self.assertTrue(hasattr(tokenizer, attr))
@@ -79,13 +90,13 @@ class CommonTestCases:
# Now let's start the test
tokenizer = self.get_tokenizer(max_len=42)
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
before_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
with TemporaryDirectory() as tmpdirname:
tokenizer.save_pretrained(tmpdirname)
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
after_tokens = tokenizer.encode("He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
self.assertListEqual(before_tokens, after_tokens)
self.assertEqual(tokenizer.max_len, 42)
@@ -96,12 +107,12 @@ class CommonTestCases:
tokenizer = self.get_tokenizer()
self.assertIsNotNone(tokenizer)
text = u"Munich and Berlin are nice cities"
text = "Munich and Berlin are nice cities"
subwords = tokenizer.tokenize(text)
with TemporaryDirectory() as tmpdirname:
filename = os.path.join(tmpdirname, u"tokenizer.bin")
filename = os.path.join(tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)
@@ -122,7 +133,7 @@ class CommonTestCases:
toks0 = tokenizer.tokenize(text) # toks before adding new_toks
new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", 'AAAAA BBBBBB', 'CCCCCCCCCDDDDDDDD']
new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
added = tokenizer.add_tokens(new_toks)
self.assertEqual(added, 2)
@@ -178,8 +189,7 @@ class CommonTestCases:
self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
'pad_token': "<<<<<|||>|>>>>|>"}
new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||>|>>>>|>"}
added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
vocab_size_3 = tokenizer.vocab_size
all_size_3 = len(tokenizer)
@@ -189,8 +199,9 @@ class CommonTestCases:
self.assertEqual(added_toks_2, len(new_toks_2))
self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
add_special_tokens=False)
tokens = tokenizer.encode(
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
)
out_string = tokenizer.decode(tokens)
self.assertGreaterEqual(len(tokens), 6)
@@ -242,7 +253,7 @@ class CommonTestCases:
def test_encode_decode_with_spaces(self):
tokenizer = self.get_tokenizer()
new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
new_toks = ["[ABC]", "[DEF]", "GHI IHG"]
tokenizer.add_tokens(new_toks)
input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
encoded = tokenizer.encode(input, add_special_tokens=False)
@@ -264,7 +275,7 @@ class CommonTestCases:
tokenizer = self.get_tokenizer()
if tokenizer.build_inputs_with_special_tokens.__qualname__.split('.')[0] != "PreTrainedTokenizer":
if tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer":
seq_0 = "Test this method."
seq_1 = "With these inputs."
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
@@ -293,17 +304,19 @@ class CommonTestCases:
sequence = tokenizer.encode(seq_0, add_special_tokens=False)
num_added_tokens = tokenizer.num_added_tokens()
total_length = len(sequence) + num_added_tokens
information = tokenizer.encode_plus(seq_0,
information = tokenizer.encode_plus(
seq_0,
max_length=total_length - 2,
add_special_tokens=True,
stride=stride,
return_overflowing_tokens=True)
return_overflowing_tokens=True,
)
truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"]
self.assertEqual(len(overflowing_tokens), 2 + stride)
self.assertEqual(overflowing_tokens, sequence[-(2 + stride):])
self.assertEqual(overflowing_tokens, sequence[-(2 + stride) :])
self.assertEqual(len(truncated_sequence), total_length - 2)
self.assertEqual(truncated_sequence, tokenizer.build_inputs_with_special_tokens(sequence[:-2]))
@@ -320,24 +333,35 @@ class CommonTestCases:
sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
tokenizer.encode(seq_0, add_special_tokens=False),
tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
tokenizer.encode(seq_1, add_special_tokens=False)[:-2],
)
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
stride=stride, truncation_strategy='only_second',
return_overflowing_tokens=True)
information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
add_special_tokens=True, stride=stride,
truncation_strategy='only_first',
return_overflowing_tokens=True)
information = tokenizer.encode_plus(
seq_0,
seq_1,
max_length=len(sequence) - 2,
add_special_tokens=True,
stride=stride,
truncation_strategy="only_second",
return_overflowing_tokens=True,
)
information_first_truncated = tokenizer.encode_plus(
seq_0,
seq_1,
max_length=len(sequence) - 2,
add_special_tokens=True,
stride=stride,
truncation_strategy="only_first",
return_overflowing_tokens=True,
)
truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"]
overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
self.assertEqual(len(overflowing_tokens), 2 + stride)
self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride):])
self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride):])
self.assertEqual(overflowing_tokens, sequence_1_no_special_tokens[-(2 + stride) :])
self.assertEqual(overflowing_tokens_first_truncated, sequence_0_no_special_tokens[-(2 + stride) :])
self.assertEqual(len(truncated_sequence), len(sequence) - 2)
self.assertEqual(truncated_sequence, truncated_second_sequence)
@@ -361,37 +385,47 @@ class CommonTestCases:
# Testing single inputs
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True, return_special_tokens_mask=True)
encoded_sequence_dict = tokenizer.encode_plus(
sequence_0, add_special_tokens=True, return_special_tokens_mask=True
)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
filtered_sequence = [
(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
]
filtered_sequence = [x for x in filtered_sequence if x is not None]
self.assertEqual(encoded_sequence, filtered_sequence)
# Testing inputs pairs
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1,
add_special_tokens=False)
encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True,
return_special_tokens_mask=True)
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(
sequence_1, add_special_tokens=False
)
encoded_sequence_dict = tokenizer.encode_plus(
sequence_0, sequence_1, add_special_tokens=True, return_special_tokens_mask=True
)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
filtered_sequence = [(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
filtered_sequence = [
(x if not special_tokens_mask[i] else None) for i, x in enumerate(encoded_sequence_w_special)
]
filtered_sequence = [x for x in filtered_sequence if x is not None]
self.assertEqual(encoded_sequence, filtered_sequence)
# Testing with already existing special tokens
if tokenizer.cls_token_id == tokenizer.unk_token_id and tokenizer.cls_token_id == tokenizer.unk_token_id:
tokenizer.add_special_tokens({'cls_token': '</s>', 'sep_token': '<s>'})
encoded_sequence_dict = tokenizer.encode_plus(sequence_0,
add_special_tokens=True,
return_special_tokens_mask=True)
tokenizer.add_special_tokens({"cls_token": "</s>", "sep_token": "<s>"})
encoded_sequence_dict = tokenizer.encode_plus(
sequence_0, add_special_tokens=True, return_special_tokens_mask=True
)
encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
special_tokens_mask_orig = encoded_sequence_dict["special_tokens_mask"]
special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True)
special_tokens_mask = tokenizer.get_special_tokens_mask(
encoded_sequence_w_special, already_has_special_tokens=True
)
self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special))
self.assertEqual(special_tokens_mask_orig, special_tokens_mask)
@@ -406,7 +440,9 @@ class CommonTestCases:
tokenizer.padding_side = "right"
encoded_sequence = tokenizer.encode(sequence)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
padded_sequence = tokenizer.encode(
sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
@@ -415,7 +451,9 @@ class CommonTestCases:
tokenizer.padding_side = "left"
encoded_sequence = tokenizer.encode(sequence)
sequence_length = len(encoded_sequence)
padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True)
padded_sequence = tokenizer.encode(
sequence, max_length=sequence_length + padding_size, pad_to_max_length=True
)
padded_sequence_length = len(padded_sequence)
assert sequence_length + padding_size == padded_sequence_length
assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
@@ -446,19 +484,24 @@ class CommonTestCases:
token_type_padding_idx = tokenizer.pad_token_type_id
encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True)
input_ids = encoded_sequence['input_ids']
token_type_ids = encoded_sequence['token_type_ids']
attention_mask = encoded_sequence['attention_mask']
special_tokens_mask = encoded_sequence['special_tokens_mask']
input_ids = encoded_sequence["input_ids"]
token_type_ids = encoded_sequence["token_type_ids"]
attention_mask = encoded_sequence["attention_mask"]
special_tokens_mask = encoded_sequence["special_tokens_mask"]
sequence_length = len(input_ids)
# Test right padding
tokenizer.padding_side = "right"
padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
padded_input_ids = padded_sequence['input_ids']
padded_token_type_ids = padded_sequence['token_type_ids']
padded_attention_mask = padded_sequence['attention_mask']
padded_special_tokens_mask = padded_sequence['special_tokens_mask']
padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
return_special_tokens_mask=True,
)
padded_input_ids = padded_sequence["input_ids"]
padded_token_type_ids = padded_sequence["token_type_ids"]
padded_attention_mask = padded_sequence["attention_mask"]
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
padded_sequence_length = len(padded_input_ids)
assert sequence_length + padding_size == padded_sequence_length
@@ -469,11 +512,16 @@ class CommonTestCases:
# Test left padding
tokenizer.padding_side = "left"
padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True)
padded_input_ids = padded_sequence['input_ids']
padded_token_type_ids = padded_sequence['token_type_ids']
padded_attention_mask = padded_sequence['attention_mask']
padded_special_tokens_mask = padded_sequence['special_tokens_mask']
padded_sequence = tokenizer.encode_plus(
sequence,
max_length=sequence_length + padding_size,
pad_to_max_length=True,
return_special_tokens_mask=True,
)
padded_input_ids = padded_sequence["input_ids"]
padded_token_type_ids = padded_sequence["token_type_ids"]
padded_attention_mask = padded_sequence["attention_mask"]
padded_special_tokens_mask = padded_sequence["special_tokens_mask"]
padded_sequence_length = len(padded_input_ids)
assert sequence_length + padding_size == padded_sequence_length
@@ -20,14 +20,14 @@ from io import open
from transformers import is_torch_available
if is_torch_available():
import torch
from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases
from .utils import require_torch
if is_torch_available():
from transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
@require_torch
class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
@@ -37,45 +37,53 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
super(TransfoXLTokenizationTest, self).setUp()
vocab_tokens = [
"<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
"running", ",", "low", "l",
"<unk>",
"[CLS]",
"[SEP]",
"want",
"unwanted",
"wa",
"un",
"running",
",",
"low",
"l",
]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self, **kwargs):
kwargs['lower_case'] = True
kwargs["lower_case"] = True
return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"<unk> UNwanted , running"
output_text = u"<unk> unwanted, running"
input_text = "<unk> UNwanted , running"
output_text = "<unk> unwanted, running"
return input_text, output_text
def test_full_tokenizer(self):
tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
tokens = tokenizer.tokenize("<unk> UNwanted , running")
self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
def test_full_tokenizer_lower(self):
tokenizer = TransfoXLTokenizer(lower_case=True)
self.assertListEqual(
tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
["hello", "!", "how", "are", "you", "?"])
tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["hello", "!", "how", "are", "you", "?"]
)
def test_full_tokenizer_no_lower(self):
tokenizer = TransfoXLTokenizer(lower_case=False)
self.assertListEqual(
tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "),
["HeLLo", "!", "how", "Are", "yoU", "?"])
tokenizer.tokenize(" \tHeLLo ! how \n Are yoU ? "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
)
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
@@ -12,11 +12,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import, division, print_function
import unittest
import six
from transformers import PreTrainedTokenizer
@@ -24,8 +23,8 @@ from transformers.tokenization_gpt2 import GPT2Tokenizer
from .utils import slow
class TokenizerUtilsTest(unittest.TestCase):
class TokenizerUtilsTest(unittest.TestCase):
def check_tokenizer_from_pretrained(self, tokenizer_class):
s3_models = list(tokenizer_class.max_model_input_sizes.keys())
for model_name in s3_models[:1]:
@@ -36,7 +35,7 @@ class TokenizerUtilsTest(unittest.TestCase):
for special_tok in tokenizer.all_special_tokens:
if six.PY2:
self.assertIsInstance(special_tok, unicode)
self.assertIsInstance(special_tok, unicode) # noqa: F821
else:
self.assertIsInstance(special_tok, str)
special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
@@ -46,5 +45,6 @@ class TokenizerUtilsTest(unittest.TestCase):
def test_pretrained_tokenizers(self):
self.check_tokenizer_from_pretrained(GPT2Tokenizer)
if __name__ == "__main__":
unittest.main()
@@ -14,15 +14,16 @@
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import os
import unittest
import json
from transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
from transformers.tokenization_xlm import VOCAB_FILES_NAMES, XLMTokenizer
from .tokenization_tests_commons import CommonTestCases
from .utils import slow
class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer_class = XLMTokenizer
@@ -31,15 +32,34 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
super(XLMTokenizationTest, self).setUp()
# Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
"w</w>", "r</w>", "t</w>",
"lo", "low", "er</w>",
"low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>"]
vocab = [
"l",
"o",
"w",
"e",
"r",
"s",
"t",
"i",
"d",
"n",
"w</w>",
"r</w>",
"t</w>",
"lo",
"low",
"er</w>",
"low</w>",
"lowest</w>",
"newer</w>",
"wider</w>",
"<unk>",
]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
with open(self.vocab_file, "w") as fp:
fp.write(json.dumps(vocab_tokens))
with open(self.merges_file, "w") as fp:
@@ -49,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"lower newer"
output_text = u"lower newer"
input_text = "lower newer"
output_text = "lower newer"
return input_text, output_text
def test_full_tokenizer(self):
@@ -64,8 +84,7 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
input_tokens = tokens + ["<unk>"]
input_bpe_tokens = [14, 15, 20]
self.assertListEqual(
tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
@slow
def test_sequence_builders(self):
@@ -80,5 +99,6 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
assert encoded_sentence == [1] + text + [1]
assert encoded_pair == [1] + text + [1] + text_2 + [1]
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
@@ -17,13 +17,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import os
import unittest
from transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
from transformers.tokenization_xlnet import SPIECE_UNDERLINE, XLNetTokenizer
from .tokenization_tests_commons import CommonTestCases
from .utils import slow
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
'fixtures/test_sentencepiece.model')
SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
@@ -40,55 +41,135 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"This is a test"
output_text = u"This is a test"
input_text = "This is a test"
output_text = "This is a test"
return input_text, output_text
def test_full_tokenizer(self):
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokens = tokenizer.tokenize(u'This is a test')
self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
tokens = tokenizer.tokenize("This is a test")
self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
self.assertListEqual(
tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
ids = tokenizer.convert_tokens_to_ids(tokens)
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
self.assertListEqual(
ids, [8, 21, 84, 55, 24, 19, 7, 0,
602, 347, 347, 347, 3, 12, 66,
46, 72, 80, 6, 0, 4])
tokens,
[
SPIECE_UNDERLINE + "I",
SPIECE_UNDERLINE + "was",
SPIECE_UNDERLINE + "b",
"or",
"n",
SPIECE_UNDERLINE + "in",
SPIECE_UNDERLINE + "",
"9",
"2",
"0",
"0",
"0",
",",
SPIECE_UNDERLINE + "and",
SPIECE_UNDERLINE + "this",
SPIECE_UNDERLINE + "is",
SPIECE_UNDERLINE + "f",
"al",
"s",
"é",
".",
],
)
ids = tokenizer.convert_tokens_to_ids(tokens)
self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])
back_tokens = tokenizer.convert_ids_to_tokens(ids)
self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
u'or', u'n', SPIECE_UNDERLINE + u'in',
SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
u'<unk>', u'.'])
self.assertListEqual(
back_tokens,
[
SPIECE_UNDERLINE + "I",
SPIECE_UNDERLINE + "was",
SPIECE_UNDERLINE + "b",
"or",
"n",
SPIECE_UNDERLINE + "in",
SPIECE_UNDERLINE + "",
"<unk>",
"2",
"0",
"0",
"0",
",",
SPIECE_UNDERLINE + "and",
SPIECE_UNDERLINE + "this",
SPIECE_UNDERLINE + "is",
SPIECE_UNDERLINE + "f",
"al",
"s",
"<unk>",
".",
],
)
def test_tokenizer_lower(self):
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"])
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
self.assertListEqual(
tokens,
[
SPIECE_UNDERLINE + "",
"i",
SPIECE_UNDERLINE + "was",
SPIECE_UNDERLINE + "b",
"or",
"n",
SPIECE_UNDERLINE + "in",
SPIECE_UNDERLINE + "",
"9",
"2",
"0",
"0",
"0",
",",
SPIECE_UNDERLINE + "and",
SPIECE_UNDERLINE + "this",
SPIECE_UNDERLINE + "is",
SPIECE_UNDERLINE + "f",
"al",
"se",
".",
],
)
self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["▁he", "ll", "o"])
def test_tokenizer_no_lower(self):
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or',
u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
self.assertListEqual(
tokens,
[
SPIECE_UNDERLINE + "I",
SPIECE_UNDERLINE + "was",
SPIECE_UNDERLINE + "b",
"or",
"n",
SPIECE_UNDERLINE + "in",
SPIECE_UNDERLINE + "",
"9",
"2",
"0",
"0",
"0",
",",
SPIECE_UNDERLINE + "and",
SPIECE_UNDERLINE + "this",
SPIECE_UNDERLINE + "is",
SPIECE_UNDERLINE + "f",
"al",
"se",
".",
],
)
@slow
def test_sequence_builders(self):
@@ -104,5 +185,5 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
assert encoded_pair == text + [4] + text_2 + [4, 3]
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
import os
import unittest
import tempfile
import unittest
from distutils.util import strtobool
from transformers.file_utils import _tf_available, _torch_available
@@ -27,6 +26,7 @@ def parse_flag_from_env(key, default=False):
raise ValueError("If set, {} must be yes or no.".format(key))
return _value
_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
@@ -13,45 +13,47 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
from .tokenization_utils import PreTrainedTokenizer
import logging
import unicodedata
import six
import os
import unicodedata
from shutil import copyfile
import six
from .tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'albert-base-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
'albert-large-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
'albert-xlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
'albert-xxlarge-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
'albert-base-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
'albert-large-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
'albert-xlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
'albert-xxlarge-v2': "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
"vocab_file": {
"albert-base-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-spiece.model",
"albert-large-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-spiece.model",
"albert-xlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-spiece.model",
"albert-xxlarge-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-spiece.model",
"albert-base-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model",
"albert-large-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-large-v2-spiece.model",
"albert-xlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xlarge-v2-spiece.model",
"albert-xxlarge-v2": "https://s3.amazonaws.com/models.huggingface.co/bert/albert-xxlarge-v2-spiece.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'albert-base-v1': 512,
'albert-large-v1': 512,
'albert-xlarge-v1': 512,
'albert-xxlarge-v1': 512,
'albert-base-v2': 512,
'albert-large-v2': 512,
'albert-xlarge-v2': 512,
'albert-xxlarge-v2': 512,
"albert-base-v1": 512,
"albert-large-v1": 512,
"albert-xlarge-v1": 512,
"albert-xxlarge-v1": 512,
"albert-base-v2": 512,
"albert-large-v2": 512,
"albert-xlarge-v2": 512,
"albert-xxlarge-v2": 512,
}
SPIECE_UNDERLINE = u'▁'
SPIECE_UNDERLINE = "▁"
class AlbertTokenizer(PreTrainedTokenizer):
"""
@@ -59,18 +61,36 @@ class AlbertTokenizer(PreTrainedTokenizer):
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file,
do_lower_case=True, remove_space=True, keep_accents=False,
bos_token="[CLS]", eos_token="[SEP]", unk_token="<unk>", sep_token="[SEP]",
pad_token="<pad>", cls_token="[CLS]", mask_token="[MASK]", **kwargs):
super(AlbertTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
unk_token=unk_token, sep_token=sep_token,
pad_token=pad_token, cls_token=cls_token,
mask_token=mask_token, **kwargs)
def __init__(
self,
vocab_file,
do_lower_case=True,
remove_space=True,
keep_accents=False,
bos_token="[CLS]",
eos_token="[SEP]",
unk_token="<unk>",
sep_token="[SEP]",
pad_token="<pad>",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs
):
super(AlbertTokenizer, self).__init__(
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs
)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
@@ -78,8 +98,10 @@ class AlbertTokenizer(PreTrainedTokenizer):
try:
import sentencepiece as spm
except ImportError:
logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece")
logger.warning(
"You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
self.do_lower_case = do_lower_case
self.remove_space = remove_space
@@ -103,24 +125,26 @@ class AlbertTokenizer(PreTrainedTokenizer):
try:
import sentencepiece as spm
except ImportError:
logger.warning("You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece")
logger.warning(
"You need to install SentencePiece to use AlbertTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs):
if self.remove_space:
outputs = ' '.join(inputs.strip().split())
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
outputs = outputs.replace("``", '"').replace("''", '"')
if six.PY2 and isinstance(outputs, str):
outputs = outputs.decode('utf-8')
outputs = outputs.decode("utf-8")
if not self.keep_accents:
outputs = unicodedata.normalize('NFKD', outputs)
outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
outputs = unicodedata.normalize("NFKD", outputs)
outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
if self.do_lower_case:
outputs = outputs.lower()
@@ -132,8 +156,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
"""
text = self.preprocess_text(text)
# note(zhiliny): in some systems, sentencepiece only accepts str for py2
if six.PY2 and isinstance(text, unicode):
text = text.encode('utf-8')
if six.PY2 and isinstance(text, unicode): # noqa: F821
text = text.encode("utf-8")
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
@@ -141,9 +165,8 @@ class AlbertTokenizer(PreTrainedTokenizer):
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(',') and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(
piece[:-1].replace(SPIECE_UNDERLINE, ''))
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
if len(cur_pieces[0]) == 1:
cur_pieces = cur_pieces[1:]
@@ -159,7 +182,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
ret_pieces = []
for piece in new_pieces:
if isinstance(piece, str):
piece = piece.decode('utf-8')
piece = piece.decode("utf-8")
ret_pieces.append(piece)
new_pieces = ret_pieces
@@ -173,12 +196,12 @@ class AlbertTokenizer(PreTrainedTokenizer):
"""Converts an index (integer) in a token (string/unicode) using the vocab."""
token = self.sp_model.IdToPiece(index)
if six.PY2 and return_unicode and isinstance(token, str):
token = token.decode('utf-8')
token = token.decode("utf-8")
return token
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -213,8 +236,10 @@ class AlbertTokenizer(PreTrainedTokenizer):
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError("You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model.")
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
@@ -244,7 +269,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
@@ -18,23 +18,25 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging
from .tokenization_albert import AlbertTokenizer
from .tokenization_bert import BertTokenizer
from .tokenization_bert_japanese import BertJapaneseTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_camembert import CamembertTokenizer
from .tokenization_ctrl import CTRLTokenizer
from .tokenization_transfo_xl import TransfoXLTokenizer
from .tokenization_xlnet import XLNetTokenizer
from .tokenization_xlm import XLMTokenizer
from .tokenization_roberta import RobertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer
from .tokenization_camembert import CamembertTokenizer
from .tokenization_albert import AlbertTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_roberta import RobertaTokenizer
from .tokenization_t5 import T5Tokenizer
from .tokenization_transfo_xl import TransfoXLTokenizer
from .tokenization_xlm import XLMTokenizer
from .tokenization_xlm_roberta import XLMRobertaTokenizer
from .tokenization_xlnet import XLNetTokenizer
logger = logging.getLogger(__name__)
class AutoTokenizer(object):
r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
that will be instantiated as one of the tokenizer classes of the library
@@ -62,9 +64,12 @@ class AutoTokenizer(object):
This class cannot be instantiated using `__init__()` (throw an error).
"""
def __init__(self):
raise EnvironmentError("AutoTokenizer is designed to be instantiated "
"using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.")
raise EnvironmentError(
"AutoTokenizer is designed to be instantiated "
"using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
@@ -125,34 +130,38 @@ class AutoTokenizer(object):
tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
"""
if 't5' in pretrained_model_name_or_path:
if "t5" in pretrained_model_name_or_path:
return T5Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'distilbert' in pretrained_model_name_or_path:
elif "distilbert" in pretrained_model_name_or_path:
return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'albert' in pretrained_model_name_or_path:
elif "albert" in pretrained_model_name_or_path:
return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'camembert' in pretrained_model_name_or_path:
elif "camembert" in pretrained_model_name_or_path:
return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'xlm-roberta' in pretrained_model_name_or_path:
elif "xlm-roberta" in pretrained_model_name_or_path:
return XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
elif "roberta" in pretrained_model_name_or_path:
return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert-base-japanese' in pretrained_model_name_or_path:
elif "bert-base-japanese" in pretrained_model_name_or_path:
return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
elif "bert" in pretrained_model_name_or_path:
return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
elif "openai-gpt" in pretrained_model_name_or_path:
return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
elif "gpt2" in pretrained_model_name_or_path:
return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
elif "transfo-xl" in pretrained_model_name_or_path:
return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
elif "xlnet" in pretrained_model_name_or_path:
return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
elif "xlm" in pretrained_model_name_or_path:
return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'ctrl' in pretrained_model_name_or_path:
elif "ctrl" in pretrained_model_name_or_path:
return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
raise ValueError(
"Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
"'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(
pretrained_model_name_or_path
)
)
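
An aside for orientation (not part of the diff): the chain of elif branches reformatted above is how AutoTokenizer.from_pretrained selects a concrete tokenizer class, by substring-matching the model identifier. A minimal usage sketch, assuming a transformers installation from around this release:

from transformers import AutoTokenizer

# "bert-base-uncased" contains the substring "bert", so the dispatch chain above
# routes the call to BertTokenizer.from_pretrained.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("Munich and Berlin are nice cities", add_special_tokens=True)
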
@@ -24,71 +24,71 @@ from io import open
from .tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
"vocab_file": {
"bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
"bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
"bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
"bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
"bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
"bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
"bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
"bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
"bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
"bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
"bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
"bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
"bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
"bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
"bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
"bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'bert-base-uncased': 512,
'bert-large-uncased': 512,
'bert-base-cased': 512,
'bert-large-cased': 512,
'bert-base-multilingual-uncased': 512,
'bert-base-multilingual-cased': 512,
'bert-base-chinese': 512,
'bert-base-german-cased': 512,
'bert-large-uncased-whole-word-masking': 512,
'bert-large-cased-whole-word-masking': 512,
'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
'bert-large-cased-whole-word-masking-finetuned-squad': 512,
'bert-base-cased-finetuned-mrpc': 512,
'bert-base-german-dbmdz-cased': 512,
'bert-base-german-dbmdz-uncased': 512,
'bert-base-finnish-cased-v1': 512,
'bert-base-finnish-uncased-v1': 512,
"bert-base-uncased": 512,
"bert-large-uncased": 512,
"bert-base-cased": 512,
"bert-large-cased": 512,
"bert-base-multilingual-uncased": 512,
"bert-base-multilingual-cased": 512,
"bert-base-chinese": 512,
"bert-base-german-cased": 512,
"bert-large-uncased-whole-word-masking": 512,
"bert-large-cased-whole-word-masking": 512,
"bert-large-uncased-whole-word-masking-finetuned-squad": 512,
"bert-large-cased-whole-word-masking-finetuned-squad": 512,
"bert-base-cased-finetuned-mrpc": 512,
"bert-base-german-dbmdz-cased": 512,
"bert-base-german-dbmdz-uncased": 512,
"bert-base-finnish-cased-v1": 512,
"bert-base-finnish-uncased-v1": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
'bert-base-uncased': {'do_lower_case': True},
'bert-large-uncased': {'do_lower_case': True},
'bert-base-cased': {'do_lower_case': False},
'bert-large-cased': {'do_lower_case': False},
'bert-base-multilingual-uncased': {'do_lower_case': True},
'bert-base-multilingual-cased': {'do_lower_case': False},
'bert-base-chinese': {'do_lower_case': False},
'bert-base-german-cased': {'do_lower_case': False},
'bert-large-uncased-whole-word-masking': {'do_lower_case': True},
'bert-large-cased-whole-word-masking': {'do_lower_case': False},
'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
'bert-base-german-dbmdz-cased': {'do_lower_case': False},
'bert-base-german-dbmdz-uncased': {'do_lower_case': True},
'bert-base-finnish-cased-v1': {'do_lower_case': False},
'bert-base-finnish-uncased-v1': {'do_lower_case': True},
"bert-base-uncased": {"do_lower_case": True},
"bert-large-uncased": {"do_lower_case": True},
"bert-base-cased": {"do_lower_case": False},
"bert-large-cased": {"do_lower_case": False},
"bert-base-multilingual-uncased": {"do_lower_case": True},
"bert-base-multilingual-cased": {"do_lower_case": False},
"bert-base-chinese": {"do_lower_case": False},
"bert-base-german-cased": {"do_lower_case": False},
"bert-large-uncased-whole-word-masking": {"do_lower_case": True},
"bert-large-cased-whole-word-masking": {"do_lower_case": False},
"bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
"bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
"bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
"bert-base-german-dbmdz-cased": {"do_lower_case": False},
"bert-base-german-dbmdz-uncased": {"do_lower_case": True},
"bert-base-finnish-cased-v1": {"do_lower_case": False},
"bert-base-finnish-uncased-v1": {"do_lower_case": True},
}
@@ -98,7 +98,7 @@ def load_vocab(vocab_file):
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip('\n')
token = token.rstrip("\n")
vocab[token] = index
return vocab
@@ -132,9 +132,20 @@ class BertTokenizer(PreTrainedTokenizer):
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
**kwargs
):
"""Constructs a BertTokenizer.
Args:
@@ -152,24 +163,29 @@ class BertTokenizer(PreTrainedTokenizer):
This should likely be deactivated for Japanese:
see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
"""
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
pad_token=pad_token, cls_token=cls_token,
mask_token=mask_token, **kwargs)
super(BertTokenizer, self).__init__(
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs
)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_basic_tokenize = do_basic_tokenize
if do_basic_tokenize:
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=tokenize_chinese_chars)
self.basic_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars
)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
@property
@@ -196,7 +212,7 @@ class BertTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
out_string = ' '.join(tokens).replace(' ##', '').strip()
out_string = " ".join(tokens).replace(" ##", "").strip()
return out_string
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
@@ -231,8 +247,10 @@ class BertTokenizer(PreTrainedTokenizer):
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError("You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model.")
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
@@ -258,16 +276,18 @@ class BertTokenizer(PreTrainedTokenizer):
"""Save the tokenizer vocabulary to a directory or file."""
index = 0
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
else:
vocab_file = vocab_path
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!".format(vocab_file))
logger.warning(
"Saving vocabulary to {}: vocabulary indices are not consecutive."
" Please check that the vocabulary is not corrupted!".format(vocab_file)
)
index = token_index
writer.write(token + u'\n')
writer.write(token + "\n")
index += 1
return (vocab_file,)
@@ -382,14 +402,16 @@ class BasicTokenizer(object):
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
@@ -399,7 +421,7 @@ class BasicTokenizer(object):
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
@@ -499,8 +521,7 @@ class BasicTokenizer(object):
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
@@ -19,55 +19,54 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import collections
import logging
import os
import six
import unicodedata
from io import open
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
from .tokenization_utils import PreTrainedTokenizer
import six
from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
"vocab_file": {
"bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
"bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
"bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
"bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'bert-base-japanese': 512,
'bert-base-japanese-whole-word-masking': 512,
'bert-base-japanese-char': 512,
'bert-base-japanese-char-whole-word-masking': 512
"bert-base-japanese": 512,
"bert-base-japanese-whole-word-masking": 512,
"bert-base-japanese-char": 512,
"bert-base-japanese-char-whole-word-masking": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
'bert-base-japanese': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'wordpiece'
"bert-base-japanese": {
"do_lower_case": False,
"word_tokenizer_type": "mecab",
"subword_tokenizer_type": "wordpiece",
},
'bert-base-japanese-whole-word-masking':{
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'wordpiece'
"bert-base-japanese-whole-word-masking": {
"do_lower_case": False,
"word_tokenizer_type": "mecab",
"subword_tokenizer_type": "wordpiece",
},
'bert-base-japanese-char': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'character'
"bert-base-japanese-char": {
"do_lower_case": False,
"word_tokenizer_type": "mecab",
"subword_tokenizer_type": "character",
},
"bert-base-japanese-char-whole-word-masking": {
"do_lower_case": False,
"word_tokenizer_type": "mecab",
"subword_tokenizer_type": "character",
},
'bert-base-japanese-char-whole-word-masking': {
'do_lower_case': False,
'word_tokenizer_type': 'mecab',
'subword_tokenizer_type': 'character'
}
}
......@@ -79,11 +78,22 @@ class BertJapaneseTokenizer(BertTokenizer):
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, do_lower_case=False,
do_word_tokenize=True, do_subword_tokenize=True,
word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
never_split=None, unk_token='[UNK]', sep_token='[SEP]',
pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
def __init__(
self,
vocab_file,
do_lower_case=False,
do_word_tokenize=True,
do_subword_tokenize=True,
word_tokenizer_type="basic",
subword_tokenizer_type="wordpiece",
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
**kwargs
):
"""Constructs a MecabBertTokenizer.
Args:
......@@ -100,56 +110,53 @@ class BertJapaneseTokenizer(BertTokenizer):
**subword_tokenizer_type**: (`optional`) string (default "wordpiece")
Type of subword tokenizer.
"""
super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
pad_token=pad_token, cls_token=cls_token,
mask_token=mask_token, **kwargs)
super(BertTokenizer, self).__init__(
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
**kwargs
)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
)
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = collections.OrderedDict(
[(ids, tok) for tok, ids in self.vocab.items()])
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
self.do_word_tokenize = do_word_tokenize
if do_word_tokenize:
if word_tokenizer_type == 'basic':
self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
never_split=never_split,
tokenize_chinese_chars=False)
elif word_tokenizer_type == 'mecab':
self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
never_split=never_split)
if word_tokenizer_type == "basic":
self.word_tokenizer = BasicTokenizer(
do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False
)
elif word_tokenizer_type == "mecab":
self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, never_split=never_split)
else:
raise ValueError(
"Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
self.do_subword_tokenize = do_subword_tokenize
if do_subword_tokenize:
if subword_tokenizer_type == 'wordpiece':
self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
unk_token=self.unk_token)
elif subword_tokenizer_type == 'character':
self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
unk_token=self.unk_token)
if subword_tokenizer_type == "wordpiece":
self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
elif subword_tokenizer_type == "character":
self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=self.unk_token)
else:
raise ValueError(
"Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
raise ValueError("Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
def _tokenize(self, text):
if self.do_word_tokenize:
tokens = self.word_tokenizer.tokenize(text,
never_split=self.all_special_tokens)
tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens)
else:
tokens = [text]
if self.do_subword_tokenize:
split_tokens = [sub_token for token in tokens
for sub_token in self.subword_tokenizer.tokenize(token)]
split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)]
else:
split_tokens = tokens
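# Illustrative usage sketch (editor's addition, not part of this diff). It assumes the package from
# this branch is installed together with the MeCab bindings that MecabTokenizer below relies on, and
# that BertJapaneseTokenizer is exposed at the top level of the package.
from transformers import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
# MeCab first splits the sentence into words, then WordPiece splits each word into subwords,
# mirroring the do_word_tokenize / do_subword_tokenize branches of _tokenize above.
print(tokenizer.tokenize("吾輩は猫である。"))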
......@@ -177,27 +184,28 @@ class MecabTokenizer(object):
self.normalize_text = normalize_text
import MeCab
self.mecab = MeCab.Tagger()
def tokenize(self, text, never_split=None, **kwargs):
"""Tokenizes a piece of text."""
if self.normalize_text:
text = unicodedata.normalize('NFKC', text)
text = unicodedata.normalize("NFKC", text)
never_split = self.never_split + (never_split if never_split is not None else [])
tokens = []
if six.PY2:
mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
mecab_output = self.mecab.parse(text.encode("utf-8")).decode("utf-8")
else:
mecab_output = self.mecab.parse(text)
cursor = 0
for line in mecab_output.split('\n'):
if line == 'EOS':
for line in mecab_output.split("\n"):
if line == "EOS":
break
token, _ = line.split('\t')
token, _ = line.split("\t")
token_start = text.index(token, cursor)
token_end = token_start + len(token)
if self.do_lower_case and token not in never_split:
......@@ -240,7 +248,7 @@ class CharacterTokenizer(object):
A list of characters.
"""
if self.normalize_text:
text = unicodedata.normalize('NFKC', text)
text = unicodedata.normalize("NFKC", text)
output_tokens = []
for i, char in enumerate(text):
......
......@@ -13,32 +13,34 @@
# See the License for the specific language governing permissions and
# limitations under the License
""" Tokenization classes for Camembert model."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
from shutil import copyfile
import sentencepiece as spm
from transformers.tokenization_utils import PreTrainedTokenizer
from .tokenization_xlnet import SPIECE_UNDERLINE
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'camembert-base': "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
"vocab_file": {
"camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-sentencepiece.bpe.model",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'camembert-base': None,
"camembert-base": None,
}
class CamembertTokenizer(PreTrainedTokenizer):
"""
Adapted from RobertaTokenizer and XLNetTokenizer
......@@ -46,17 +48,36 @@ class CamembertTokenizer(PreTrainedTokenizer):
- requires `SentencePiece <https://github.com/google/sentencepiece>`_
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
additional_special_tokens=['<s>NOTUSED', '</s>NOTUSED'], **kwargs):
super(CamembertTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
mask_token=mask_token, additional_special_tokens=additional_special_tokens,
**kwargs)
def __init__(
self,
vocab_file,
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
**kwargs
):
super(CamembertTokenizer, self).__init__(
max_len=512,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs
)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
self.sp_model = spm.SentencePieceProcessor()
......@@ -64,9 +85,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
self.vocab_file = vocab_file
# HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
# sentencepiece vocabulary (this is the case for <s> and </s>).
self.fairseq_tokens_to_ids = {'<s>NOTUSED': 0, '<pad>': 1, '</s>NOTUSED': 2, '<unk>': 3}
self.fairseq_tokens_to_ids = {"<s>NOTUSED": 0, "<pad>": 1, "</s>NOTUSED": 2, "<unk>": 3}
self.fairseq_offset = len(self.fairseq_tokens_to_ids)
self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
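# Editor's note (illustrative): fairseq_offset is 4 here, so ids 0-3 stay reserved for the fairseq
# specials above, "<mask>" is appended after the end of the sentencepiece vocabulary, and plain
# sentencepiece ids are presumably shifted by this offset when tokens are converted to model ids.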
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
......@@ -100,8 +121,10 @@ class CamembertTokenizer(PreTrainedTokenizer):
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError("You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model.")
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
......@@ -148,7 +171,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
return out_string
def save_vocabulary(self, save_directory):
......@@ -158,7 +181,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
......
......@@ -13,37 +13,32 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Salesforce CTRL."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import os
import regex as re
from io import open
import regex as re
from .tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
'vocab_file': 'vocab.json',
'merges_file': 'merges.txt',
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",
},
'merges_file':
{
'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",
},
"vocab_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json"},
"merges_file": {"ctrl": "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'ctrl': 256,
"ctrl": 256,
}
CONTROL_CODES = {
......@@ -104,6 +99,7 @@ CONTROL_CODES = {
"multilingual": 128406,
}
def get_pairs(word):
"""Return set of symbol pairs in a word.
......@@ -118,11 +114,13 @@ def get_pairs(word):
pairs = set(pairs)
return pairs
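# For example (illustrative): get_pairs(("l", "o", "w", "</w>")) == {("l", "o"), ("o", "w"), ("w", "</w>")}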
class CTRLTokenizer(PreTrainedTokenizer):
"""
CTRL BPE tokenizer. Peculiarities:
- Byte-Pair-Encoding
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
......@@ -130,14 +128,18 @@ class CTRLTokenizer(PreTrainedTokenizer):
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_single_sentence = (
self.max_len
) # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = (
self.max_len
) # no default special tokens - you can update this value if you add special tokens
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v:k for k,v in self.encoder.items()}
with open(merges_file, encoding='utf-8') as merges_handle:
merges = merges_handle.read().split('\n')[1:-1]
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
......@@ -150,14 +152,14 @@ class CTRLTokenizer(PreTrainedTokenizer):
if token in self.cache:
return self.cache[token]
word = tuple(token)
word = tuple(list(word[:-1]) + [word[-1]+'</w>'])
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
......@@ -166,14 +168,15 @@ class CTRLTokenizer(PreTrainedTokenizer):
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
......@@ -184,7 +187,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
break
else:
pairs = get_pairs(word)
word = '@@ '.join(word)
word = "@@ ".join(word)
word = word[:-4]
self.cache[token] = word
return word
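# Editor's note: the loop above greedily applies the lowest-ranked learned merge until none of the
# remaining symbol pairs appears in bpe_ranks, then re-joins the symbols with "@@ " continuation
# markers and drops the trailing "</w>" via word[:-4].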
......@@ -194,10 +197,10 @@ class CTRLTokenizer(PreTrainedTokenizer):
"""
split_tokens = []
words = re.findall(r'\S+\n?', text)
words = re.findall(r"\S+\n?", text)
for token in words:
split_tokens.extend([t for t in self.bpe(token).split(' ')])
split_tokens.extend([t for t in self.bpe(token).split(" ")])
return split_tokens
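# e.g. (illustrative): "Links Hello world" is first split into ["Links", "Hello", "world"] by the
# \S+\n? pattern, and each chunk is then BPE-split independently.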
def _convert_token_to_id(self, token):
......@@ -210,7 +213,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
out_string = ' '.join(tokens).replace('@@ ', '').strip()
out_string = " ".join(tokens).replace("@@ ", "").strip()
return out_string
def save_vocabulary(self, save_directory):
......@@ -218,21 +221,23 @@ class CTRLTokenizer(PreTrainedTokenizer):
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
with open(vocab_file, 'w', encoding='utf-8') as f:
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
logger.warning(
"Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file)
)
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
......
......@@ -16,33 +16,29 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
from io import open
from .tokenization_bert import BertTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
'distilbert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
'distilbert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
"vocab_file": {
"distilbert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
"distilbert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-german-cased-vocab.txt",
"distilbert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'distilbert-base-uncased': 512,
'distilbert-base-uncased-distilled-squad': 512,
'distilbert-base-german-cased': 512,
'distilbert-base-multilingual-cased': 512,
"distilbert-base-uncased": 512,
"distilbert-base-uncased-distilled-squad": 512,
"distilbert-base-german-cased": 512,
"distilbert-base-multilingual-cased": 512,
}
......
......@@ -13,16 +13,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import json
import logging
import os
import regex as re
import sys
from io import open
import regex as re
from .tokenization_utils import PreTrainedTokenizer
try:
from functools import lru_cache
except ImportError:
......@@ -31,42 +34,40 @@ except ImportError:
def lru_cache():
return lambda func: func
from .tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
'vocab_file': 'vocab.json',
'merges_file': 'merges.txt',
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
"vocab_file": {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-vocab.json",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
},
'merges_file':
{
'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
'gpt2-xl': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
"merges_file": {
"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
"gpt2-xl": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-xl-merges.txt",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'gpt2': 1024,
'gpt2-medium': 1024,
'gpt2-large': 1024,
'gpt2-xl': 1024,
'distilgpt2': 1024,
"gpt2": 1024,
"gpt2-medium": 1024,
"gpt2-large": 1024,
"gpt2-xl": 1024,
"distilgpt2": 1024,
}
@lru_cache()
def bytes_to_unicode():
"""
......@@ -79,18 +80,21 @@ def bytes_to_unicode():
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
"""
_chr = unichr if sys.version_info[0] == 2 else chr
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
_chr = unichr if sys.version_info[0] == 2 else chr # noqa: F821
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
for b in range(2 ** 8):
if b not in bs:
bs.append(b)
cs.append(2**8+n)
cs.append(2 ** 8 + n)
n += 1
cs = [_chr(n) for n in cs]
return dict(zip(bs, cs))
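# Editor's sketch (illustrative, not part of the diff): rebuilding the same table shows why GPT-2
# word-initial tokens carry a leading "Ġ" - the space byte 0x20 is not one of the "printable" bytes
# kept as-is, so it is remapped to chr(256 + 32) == "Ġ".
def _demo_space_remapping():
    kept = (
        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
    )
    shifted = 0
    for b in range(2 ** 8):
        if b == ord(" "):
            return chr(2 ** 8 + shifted)
        if b not in kept:
            shifted += 1


assert _demo_space_remapping() == "\u0120"  # "Ġ"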
def get_pairs(word):
"""Return set of symbol pairs in a word.
......@@ -103,6 +107,7 @@ def get_pairs(word):
prev_char = char
return pairs
class GPT2Tokenizer(PreTrainedTokenizer):
"""
GPT-2 BPE tokenizer. Peculiarities:
......@@ -112,15 +117,28 @@ class GPT2Tokenizer(PreTrainedTokenizer):
Otherwise, this tokenizer's ``encode``, ``decode``, and ``tokenize`` methods will not conserve
the spaces at the beginning of a string: `tokenizer.decode(tokenizer.encode(" Hello")) = "Hello"`
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
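# Illustrative usage sketch (editor's addition): exercising the caveat from the class docstring
# above, assuming the "gpt2" vocabulary files can be downloaded.
from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
ids = tok.encode(" Hello", add_special_tokens=False)
print(tok.decode(ids))  # per the docstring, the leading space is not conserved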
def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
unk_token="<|endoftext|>",
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
**kwargs
):
super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_single_sentence = (
self.max_len
) # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = (
self.max_len
) # no default special tokens - you can update this value if you add special tokens
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
......@@ -128,8 +146,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
with open(merges_file, encoding='utf-8') as merges_handle:
bpe_merges = merges_handle.read().split('\n')[1:-1]
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
......@@ -151,7 +169,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
return token
while True:
bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
......@@ -160,14 +178,15 @@ class GPT2Tokenizer(PreTrainedTokenizer):
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
......@@ -178,7 +197,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
word = " ".join(word)
self.cache[token] = word
return word
......@@ -189,15 +208,19 @@ class GPT2Tokenizer(PreTrainedTokenizer):
Begin the sentence with at least one space so that the first word is tokenized the same way it would be in the middle of a sentence (GPT-2 and RoBERTa tokenizers).
"""
if add_prefix_space:
text = ' ' + text
text = " " + text
bpe_tokens = []
for token in re.findall(self.pat, text):
if sys.version_info[0] == 2:
token = ''.join(self.byte_encoder[ord(b)] for b in token) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
token = "".join(
self.byte_encoder[ord(b)] for b in token
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
else:
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
return bpe_tokens
def _convert_token_to_id(self, token):
......@@ -210,8 +233,8 @@ class GPT2Tokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
text = ''.join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
text = "".join(tokens)
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def save_vocabulary(self, save_directory):
......@@ -219,21 +242,23 @@ class GPT2Tokenizer(PreTrainedTokenizer):
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
with open(vocab_file, 'w', encoding='utf-8') as f:
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
logger.warning(
"Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file)
)
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
......@@ -13,8 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
......@@ -22,31 +21,27 @@ import os
import re
from io import open
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_bert import BasicTokenizer
from .tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
'vocab_file': 'vocab.json',
'merges_file': 'merges.txt',
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
},
'merges_file':
{
'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
},
"vocab_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"},
"merges_file": {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'openai-gpt': 512,
"openai-gpt": 512,
}
def get_pairs(word):
"""
Return set of symbol pairs in a word.
......@@ -59,27 +54,30 @@ def get_pairs(word):
prev_char = char
return pairs
def text_standardize(text):
"""
fixes some issues the spacy tokenizer had on books corpus
and also does some whitespace standardization
"""
text = text.replace('—', '-')
text = text.replace('–', '-')
text = text.replace('―', '-')
text = text.replace('…', '...')
text = text.replace('´', "'")
text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
text = re.sub(r'\s*\n\s*', ' \n ', text)
text = re.sub(r'[^\S\n]+', ' ', text)
text = text.replace("—", "-")
text = text.replace("–", "-")
text = text.replace("―", "-")
text = text.replace("…", "...")
text = text.replace("´", "'")
text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
text = re.sub(r"\s*\n\s*", " \n ", text)
text = re.sub(r"[^\S\n]+", " ", text)
return text.strip()
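# For example (illustrative): text_standardize("wait…what?!") == "wait...what ? !"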
class OpenAIGPTTokenizer(PreTrainedTokenizer):
"""
BPE tokenizer. Peculiarities:
- lower-cases all inputs
- uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, falling back to BERT's BasicTokenizer otherwise.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
......@@ -87,12 +85,17 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_single_sentence = (
self.max_len
) # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = (
self.max_len
) # no default special tokens - you can update this value if you add special tokens
try:
import ftfy
from spacy.lang.en import English
_nlp = English()
self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
self.fix_text = ftfy.fix_text
......@@ -103,9 +106,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v:k for k,v in self.encoder.items()}
with open(merges_file, encoding='utf-8') as merges_handle:
merges = merges_handle.read().split('\n')[1:-1]
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
......@@ -115,16 +118,16 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
return len(self.encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + '</w>',)
word = tuple(token[:-1]) + (token[-1] + "</w>",)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token+'</w>'
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
......@@ -133,14 +136,15 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
......@@ -151,9 +155,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
if word == '\n </w>':
word = '\n</w>'
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
......@@ -164,12 +168,12 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
# Using BERT's BasicTokenizer
text = self.nlp.tokenize(text)
for token in text:
split_tokens.extend([t for t in self.bpe(token).split(' ')])
split_tokens.extend([t for t in self.bpe(token).split(" ")])
else:
# Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
text = self.nlp(text_standardize(self.fix_text(text)))
for token in text:
split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
split_tokens.extend([t for t in self.bpe(token.text.lower()).split(" ")])
return split_tokens
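# Editor's note (illustrative): when ftfy/spacy are not installed, self.nlp is the BasicTokenizer
# fallback and yields plain strings, hence the bare `token` in the first branch; with spacy installed
# the items are spacy tokens, hence token.text.lower() in the second branch.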
def _convert_token_to_id(self, token):
......@@ -182,7 +186,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
out_string = ''.join(tokens).replace('</w>', ' ').strip()
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
def save_vocabulary(self, save_directory):
......@@ -190,21 +194,23 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
with open(vocab_file, 'w', encoding='utf-8') as f:
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write(u'#version: 0.2\n')
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
logger.warning(
"Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file)
)
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
......@@ -13,18 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoBERTa."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import json
import logging
import os
import regex as re
from io import open
from .tokenization_gpt2 import GPT2Tokenizer
try:
from functools import lru_cache
except ImportError:
......@@ -33,41 +28,40 @@ except ImportError:
def lru_cache():
return lambda func: func
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
'vocab_file': 'vocab.json',
'merges_file': 'merges.txt',
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
"vocab_file": {
"roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
"roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
"roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
"distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
"roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
"roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
},
'merges_file':
{
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
'roberta-base-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
'roberta-large-openai-detector': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
"merges_file": {
"roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
"roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
"roberta-large-mnli": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
"distilroberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
"roberta-base-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
"roberta-large-openai-detector": "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'roberta-base': 512,
'roberta-large': 512,
'roberta-large-mnli': 512,
'distilroberta-base': 512,
'roberta-base-openai-detector': 512,
'roberta-large-openai-detector': 512,
"roberta-base": 512,
"roberta-large": 512,
"roberta-large-mnli": 512,
"distilroberta-base": 512,
"roberta-base-openai-detector": 512,
"roberta-large-openai-detector": 512,
}
......@@ -80,16 +74,38 @@ class RobertaTokenizer(GPT2Tokenizer):
Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
mask_token=mask_token, **kwargs)
def __init__(
self,
vocab_file,
merges_file,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
**kwargs
):
super(RobertaTokenizer, self).__init__(
vocab_file=vocab_file,
merges_file=merges_file,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
**kwargs
)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens
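# Illustrative counterpart sketch (editor's addition) for the caveat in the class docstring above,
# assuming the "roberta-base" files can be downloaded.
from transformers import RobertaTokenizer

tok = RobertaTokenizer.from_pretrained("roberta-base")
ids = tok.encode("Hello", add_special_tokens=False)
print(tok.decode(ids))  # per the docstring, a space is introduced at the beginning: " Hello"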
......@@ -124,8 +140,10 @@ class RobertaTokenizer(GPT2Tokenizer):
"""
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError("You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model.")
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is None:
......
......@@ -19,33 +19,34 @@ from __future__ import absolute_import, division, print_function, unicode_litera
import logging
import os
import re
import six
from shutil import copyfile
import six
from .tokenization_utils import PreTrainedTokenizer
logger = logging.getLogger(__name__)
SPIECE_UNDERLINE = u'▁'
SPIECE_UNDERLINE = "▁"
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model shortcut names.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
"vocab_file": {
"t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
"t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
"t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
"t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
"t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
}
}
......@@ -53,13 +54,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
# Mapping from model shortcut names to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
't5-small': 512,
't5-base': 512,
't5-large': 512,
't5-3b': 512,
't5-11b': 512,
"t5-small": 512,
"t5-base": 512,
"t5-large": 512,
"t5-3b": 512,
"t5-11b": 512,
}
class T5Tokenizer(PreTrainedTokenizer):
"""
SentencePiece based tokenizer. Peculiarities:
......@@ -71,28 +73,43 @@ class T5Tokenizer(PreTrainedTokenizer):
(like in T5 preprocessing
see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>",
pad_token="<pad>", extra_ids=100, additional_special_tokens=None, **kwargs):
def __init__(
self,
vocab_file,
eos_token="</s>",
unk_token="<unk>",
pad_token="<pad>",
extra_ids=100,
additional_special_tokens=None,
**kwargs
):
# Add extra_ids to the special token list
if extra_ids > 0:
if additional_special_tokens is None:
additional_special_tokens = []
additional_special_tokens.extend([u"<extra_id_{}>".format(i) for i in range(extra_ids)])
additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(extra_ids)])
super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token,
pad_token=pad_token, additional_special_tokens=additional_special_tokens,
**kwargs)
super(T5Tokenizer, self).__init__(
eos_token=eos_token,
unk_token=unk_token,
pad_token=pad_token,
additional_special_tokens=additional_special_tokens,
**kwargs
)
try:
import sentencepiece as spm
except ImportError:
logger.warning("You need to install SentencePiece to use T5Tokenizer:"
logger.warning(
"You need to install SentencePiece to use T5Tokenizer:"
"https://github.com/google/sentencepiece"
"pip install sentencepiece")
"pip install sentencepiece"
)
self.vocab_file = vocab_file
self._extra_ids = extra_ids
......@@ -114,8 +131,10 @@ class T5Tokenizer(PreTrainedTokenizer):
try:
import sentencepiece as spm
except ImportError:
logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece")
logger.warning(
"You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
"pip install sentencepiece"
)
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(self.vocab_file)
......@@ -132,7 +151,7 @@ class T5Tokenizer(PreTrainedTokenizer):
ret_pieces = []
for piece in pieces:
if isinstance(piece, str):
piece = piece.decode('utf-8')
piece = piece.decode("utf-8")
ret_pieces.append(piece)
pieces = ret_pieces
......@@ -140,9 +159,9 @@ class T5Tokenizer(PreTrainedTokenizer):
def _convert_token_to_id(self, token):
""" Converts a token (str/unicode) in an id using the vocab. """
if token.startswith(u"<extra_id_"):
l = re.match(r'<extra_id_(\d+)>', token)
num = int(l.group(1))
if token.startswith("<extra_id_"):
match = re.match(r"<extra_id_(\d+)>", token)
num = int(match.group(1))
return self.vocab_size - num - 1
return self.sp_model.piece_to_id(token)
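# e.g. (illustrative): with the default extra_ids=100, "<extra_id_0>" maps to vocab_size - 1 and
# "<extra_id_99>" maps to vocab_size - 100, mirroring _convert_id_to_token below.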
......@@ -151,9 +170,9 @@ class T5Tokenizer(PreTrainedTokenizer):
if index < self.sp_model.get_piece_size():
token = self.sp_model.IdToPiece(index)
else:
token = u"<extra_id_{}>".format(self.vocab_size - 1 - index)
token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
if six.PY2 and return_unicode and isinstance(token, str):
token = token.decode('utf-8')
token = token.decode("utf-8")
return token
def convert_tokens_to_string(self, tokens):
......@@ -168,7 +187,7 @@ class T5Tokenizer(PreTrainedTokenizer):
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
copyfile(self.vocab_file, out_vocab_file)
......
......@@ -16,8 +16,7 @@
""" Tokenization classes for Transformer XL model.
Adapted from https://github.com/kimiyoung/transformer-xl.
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import glob
import logging
......@@ -31,55 +30,72 @@ import numpy as np
from .file_utils import cached_path
from .tokenization_utils import PreTrainedTokenizer
try:
import torch
except ImportError:
pass
# if sys.version_info[0] == 2:
# import cPickle as pickle
# else:
# import pickle
if sys.version_info[0] == 2:
import cPickle as pickle
else:
import pickle
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'}
VOCAB_FILES_NAMES = {"pretrained_vocab_file": "vocab.bin", "vocab_file": "vocab.txt"}
PRETRAINED_VOCAB_FILES_MAP = {
'pretrained_vocab_file':
{
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
"pretrained_vocab_file": {
"transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'transfo-xl-wt103': None,
"transfo-xl-wt103": None,
}
PRETRAINED_CORPUS_ARCHIVE_MAP = {
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
"transfo-xl-wt103": "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin",
}
CORPUS_NAME = 'corpus.bin'
CORPUS_NAME = "corpus.bin"
class TransfoXLTokenizer(PreTrainedTokenizer):
"""
Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False,
delimiter=None, vocab_file=None, pretrained_vocab_file=None,
never_split=None, unk_token="<unk>", eos_token="<eos>",
additional_special_tokens=["<formula>"], **kwargs):
super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
additional_special_tokens=additional_special_tokens,
**kwargs)
self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens
def __init__(
self,
special=None,
min_freq=0,
max_size=None,
lower_case=False,
delimiter=None,
vocab_file=None,
pretrained_vocab_file=None,
never_split=None,
unk_token="<unk>",
eos_token="<eos>",
additional_special_tokens=["<formula>"],
**kwargs
):
super(TransfoXLTokenizer, self).__init__(
unk_token=unk_token, eos_token=eos_token, additional_special_tokens=additional_special_tokens, **kwargs
)
self.max_len_single_sentence = (
self.max_len
) # no default special tokens - you can update this value if you add special tokens
self.max_len_sentences_pair = (
self.max_len
) # no default special tokens - you can update this value if you add special tokens
if never_split is None:
never_split = self.all_special_tokens
......@@ -106,14 +122,15 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
self.build_vocab()
def count_file(self, path, verbose=False, add_eos=False):
if verbose: logger.info('counting file {} ...'.format(path))
if verbose:
logger.info("counting file {} ...".format(path))
assert os.path.exists(path)
sents = []
with open(path, 'r', encoding='utf-8') as f:
with open(path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(' line {}'.format(idx))
logger.info(" line {}".format(idx))
symbols = self.tokenize(line, add_eos=add_eos)
self.counter.update(symbols)
sents.append(symbols)
......@@ -124,42 +141,42 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
"""
sents : a list of sentences, each a list of tokenized symbols
"""
if verbose: logger.info('counting {} sents ...'.format(len(sents)))
if verbose:
logger.info("counting {} sents ...".format(len(sents)))
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(' line {}'.format(idx))
logger.info(" line {}".format(idx))
self.counter.update(symbols)
def _build_from_file(self, vocab_file):
self.idx2sym = []
self.sym2idx = OrderedDict()
with open(vocab_file, 'r', encoding='utf-8') as f:
with open(vocab_file, "r", encoding="utf-8") as f:
for line in f:
symb = line.strip().split()[0]
self.add_symbol(symb)
if '<UNK>' in self.sym2idx:
self.unk_idx = self.sym2idx['<UNK>']
elif '<unk>' in self.sym2idx:
self.unk_idx = self.sym2idx['<unk>']
if "<UNK>" in self.sym2idx:
self.unk_idx = self.sym2idx["<UNK>"]
elif "<unk>" in self.sym2idx:
self.unk_idx = self.sym2idx["<unk>"]
else:
raise ValueError('No <unknown> token in vocabulary')
raise ValueError("No <unknown> token in vocabulary")
def save_vocabulary(self, vocab_path):
"""Save the tokenizer vocabulary to a directory or file."""
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file'])
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
torch.save(self.__dict__, vocab_file)
return (vocab_file,)
def build_vocab(self):
if self.vocab_file:
logger.info('building vocab from {}'.format(self.vocab_file))
logger.info("building vocab from {}".format(self.vocab_file))
self._build_from_file(self.vocab_file)
logger.info('final vocab size {}'.format(len(self)))
logger.info("final vocab size {}".format(len(self)))
else:
logger.info('building vocab with min_freq={}, max_size={}'.format(
self.min_freq, self.max_size))
logger.info("building vocab with min_freq={}, max_size={}".format(self.min_freq, self.max_size))
self.idx2sym = []
self.sym2idx = OrderedDict()
......@@ -167,23 +184,22 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
self.add_special(sym)
for sym, cnt in self.counter.most_common(self.max_size):
if cnt < self.min_freq: break
if cnt < self.min_freq:
break
self.add_symbol(sym)
logger.info('final vocab size {} from {} unique tokens'.format(
len(self), len(self.counter)))
logger.info("final vocab size {} from {} unique tokens".format(len(self), len(self.counter)))
def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
add_double_eos=False):
if verbose: logger.info('encoding file {} ...'.format(path))
def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
if verbose:
logger.info("encoding file {} ...".format(path))
assert os.path.exists(path)
encoded = []
with open(path, 'r', encoding='utf-8') as f:
with open(path, "r", encoding="utf-8") as f:
for idx, line in enumerate(f):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(' line {}'.format(idx))
symbols = self.tokenize(line, add_eos=add_eos,
add_double_eos=add_double_eos)
logger.info(" line {}".format(idx))
symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
encoded.append(self.convert_to_tensor(symbols))
if ordered:
......@@ -192,11 +208,12 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
return encoded
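As a rough sketch of what encode_file produces: each line becomes a LongTensor of ids, and the elided ordered=True branch presumably concatenates them into one stream, as in the original Transformer-XL code. The ids below are invented and torch is assumed to be installed.

import torch

line_ids = [[5, 7, 2], [9, 2], [4, 4, 4, 2]]            # pretend 2 is the <eos> id
encoded = [torch.LongTensor(ids) for ids in line_ids]   # one tensor per line, as above
stream = torch.cat(encoded)                             # what ordered=True would yield
print(stream.tolist())  # [5, 7, 2, 9, 2, 4, 4, 4, 2]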
def encode_sents(self, sents, ordered=False, verbose=False):
if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
if verbose:
logger.info("encoding {} sents ...".format(len(sents)))
encoded = []
for idx, symbols in enumerate(sents):
if verbose and idx > 0 and idx % 500000 == 0:
logger.info(' line {}'.format(idx))
logger.info(" line {}".format(idx))
encoded.append(self.convert_to_tensor(symbols))
if ordered:
......@@ -208,7 +225,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
if sym not in self.sym2idx:
self.idx2sym.append(sym)
self.sym2idx[sym] = len(self.idx2sym) - 1
setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
setattr(self, "{}_idx".format(sym.strip("<>")), self.sym2idx[sym])
def add_symbol(self, sym):
if sym not in self.sym2idx:
......@@ -217,7 +234,7 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
def _convert_id_to_token(self, idx):
"""Converts an id in a token (BPE) using the vocab."""
assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
assert 0 <= idx < len(self), "Index {} out of vocabulary range".format(idx)
return self.idx2sym[idx]
def _convert_token_to_id(self, sym):
......@@ -227,19 +244,19 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
else:
# logger.info('encounter unk {}'.format(sym))
# assert '<eos>' not in sym
if hasattr(self, 'unk_idx'):
if hasattr(self, "unk_idx"):
return self.sym2idx.get(sym, self.unk_idx)
# Backward compatibility with pre-trained models
elif '<unk>' in self.sym2idx:
return self.sym2idx['<unk>']
elif '<UNK>' in self.sym2idx:
return self.sym2idx['<UNK>']
elif "<unk>" in self.sym2idx:
return self.sym2idx["<unk>"]
elif "<UNK>" in self.sym2idx:
return self.sym2idx["<UNK>"]
else:
raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement")
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
out_string = ' '.join(tokens).strip()
out_string = " ".join(tokens).strip()
return out_string
def convert_to_tensor(self, symbols):
......@@ -256,21 +273,21 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
line = line.lower()
# empty delimiter '' will evaluate False
if self.delimiter == '':
if self.delimiter == "":
symbols = line
else:
symbols = line.split(self.delimiter)
if add_double_eos: # lm1b
return ['<S>'] + symbols + ['<S>']
return ["<S>"] + symbols + ["<S>"]
elif add_eos:
return symbols + ['<eos>']
return symbols + ["<eos>"]
else:
return symbols
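A minimal, self-contained illustration of the split and the eos markers above (the input line is made up):

line = "  Munich and Berlin  "
symbols = line.strip().split()        # delimiter=None/whitespace case; '' keeps the raw line (character-level)
print(symbols + ["<eos>"])            # add_eos=True: ['Munich', 'and', 'Berlin', '<eos>']
print(["<S>"] + symbols + ["<S>"])    # add_double_eos=True (lm1b): wrapped in '<S>'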
class LMOrderedIterator(object):
def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
"""
data -- LongTensor -- the LongTensor is strictly ordered
"""
......@@ -293,14 +310,15 @@ class LMOrderedIterator(object):
self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
def get_batch(self, i, bptt=None):
if bptt is None: bptt = self.bptt
if bptt is None:
bptt = self.bptt
seq_len = min(bptt, self.data.size(0) - 1 - i)
end_idx = i + seq_len
beg_idx = max(0, i - self.ext_len)
data = self.data[beg_idx:end_idx]
target = self.data[i+1:i+1+seq_len]
target = self.data[i + 1 : i + 1 + seq_len]
data_out = data.transpose(0, 1).contiguous().to(self.device)
target_out = target.transpose(0, 1).contiguous().to(self.device)
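A quick way to see the index arithmetic above is to run it on a plain list instead of the batched LongTensor (all numbers are invented):

data = list(range(20))       # stand-in for one column of the ordered id stream
bptt, ext_len, i = 4, 2, 8   # chunk length, extra left context, current offset

seq_len = min(bptt, len(data) - 1 - i)
beg_idx, end_idx = max(0, i - ext_len), i + seq_len
chunk = data[beg_idx:end_idx]            # ext_len older ids + the bptt inputs
target = data[i + 1 : i + 1 + seq_len]   # the same positions shifted by one id
print(chunk)   # [6, 7, 8, 9, 10, 11]
print(target)  # [9, 10, 11, 12]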
......@@ -315,7 +333,7 @@ class LMOrderedIterator(object):
max_len = self.bptt + max_deviation * std
i = start
while True:
bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
data, target, seq_len = self.get_batch(i, bptt)
i += seq_len
......@@ -328,7 +346,7 @@ class LMOrderedIterator(object):
class LMShuffledIterator(object):
def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False):
def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
"""
data -- list[LongTensor] -- there is no order among the LongTensors
"""
......@@ -343,8 +361,7 @@ class LMShuffledIterator(object):
def get_sent_stream(self):
# index iterator
epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \
else np.array(range(len(self.data)))
epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))
# sentence iterator
for idx in epoch_indices:
......@@ -376,10 +393,8 @@ class LMShuffledIterator(object):
# number of new tokens to fill in
n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
# first n_retain tokens are retained from last batch
data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \
streams[i][:n_new]
target[n_filled:n_filled+n_new, i] = \
streams[i][1:n_new+1]
data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
streams[i] = streams[i][n_new:]
n_filled += n_new
except StopIteration:
......@@ -408,8 +423,7 @@ class LMShuffledIterator(object):
class LMMultiFileIterator(LMShuffledIterator):
def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None,
shuffle=False):
def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
self.paths = paths
self.vocab = vocab
......@@ -460,15 +474,16 @@ class TransfoXLCorpus(object):
"We assumed '{}' was a path or url but couldn't find files {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
", ".join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()),
pretrained_model_name_or_path,
corpus_file))
corpus_file,
)
)
return None
if resolved_corpus_file == corpus_file:
logger.info("loading corpus file {}".format(corpus_file))
else:
logger.info("loading corpus file {} from cache at {}".format(
corpus_file, resolved_corpus_file))
logger.info("loading corpus file {} from cache at {}".format(corpus_file, resolved_corpus_file))
# Instantiate tokenizer.
corpus = cls(*inputs, **kwargs)
......@@ -494,83 +509,78 @@ class TransfoXLCorpus(object):
def build_corpus(self, path, dataset):
self.dataset = dataset
if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']:
self.vocab.count_file(os.path.join(path, 'train.txt'))
self.vocab.count_file(os.path.join(path, 'valid.txt'))
self.vocab.count_file(os.path.join(path, 'test.txt'))
elif self.dataset == 'wt103':
self.vocab.count_file(os.path.join(path, 'train.txt'))
elif self.dataset == 'lm1b':
if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
self.vocab.count_file(os.path.join(path, "train.txt"))
self.vocab.count_file(os.path.join(path, "valid.txt"))
self.vocab.count_file(os.path.join(path, "test.txt"))
elif self.dataset == "wt103":
self.vocab.count_file(os.path.join(path, "train.txt"))
elif self.dataset == "lm1b":
train_path_pattern = os.path.join(
path, '1-billion-word-language-modeling-benchmark-r13output',
'training-monolingual.tokenized.shuffled', 'news.en-*')
path,
"1-billion-word-language-modeling-benchmark-r13output",
"training-monolingual.tokenized.shuffled",
"news.en-*",
)
train_paths = glob.glob(train_path_pattern)
# the vocab will load from file when build_vocab() is called
self.vocab.build_vocab()
if self.dataset in ['ptb', 'wt2', 'wt103']:
self.train = self.vocab.encode_file(
os.path.join(path, 'train.txt'), ordered=True)
self.valid = self.vocab.encode_file(
os.path.join(path, 'valid.txt'), ordered=True)
self.test = self.vocab.encode_file(
os.path.join(path, 'test.txt'), ordered=True)
elif self.dataset in ['enwik8', 'text8']:
self.train = self.vocab.encode_file(
os.path.join(path, 'train.txt'), ordered=True, add_eos=False)
self.valid = self.vocab.encode_file(
os.path.join(path, 'valid.txt'), ordered=True, add_eos=False)
self.test = self.vocab.encode_file(
os.path.join(path, 'test.txt'), ordered=True, add_eos=False)
elif self.dataset == 'lm1b':
if self.dataset in ["ptb", "wt2", "wt103"]:
self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
elif self.dataset in ["enwik8", "text8"]:
self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
elif self.dataset == "lm1b":
self.train = train_paths
self.valid = self.vocab.encode_file(
os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True)
self.test = self.vocab.encode_file(
os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True)
self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True)
self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True)
def get_iterator(self, split, *args, **kwargs):
if split == 'train':
if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
if split == "train":
if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
data_iter = LMOrderedIterator(self.train, *args, **kwargs)
elif self.dataset == 'lm1b':
kwargs['shuffle'] = True
elif self.dataset == "lm1b":
kwargs["shuffle"] = True
data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
elif split in ['valid', 'test']:
data = self.valid if split == 'valid' else self.test
if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
elif split in ["valid", "test"]:
data = self.valid if split == "valid" else self.test
if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
data_iter = LMOrderedIterator(data, *args, **kwargs)
elif self.dataset == 'lm1b':
elif self.dataset == "lm1b":
data_iter = LMShuffledIterator(data, *args, **kwargs)
return data_iter
def get_lm_corpus(datadir, dataset):
fn = os.path.join(datadir, 'cache.pt')
fn_pickle = os.path.join(datadir, 'cache.pkl')
fn = os.path.join(datadir, "cache.pt")
fn_pickle = os.path.join(datadir, "cache.pkl")
if os.path.exists(fn):
logger.info('Loading cached dataset...')
logger.info("Loading cached dataset...")
corpus = torch.load(fn_pickle)
elif os.path.exists(fn):
logger.info('Loading cached dataset from pickle...')
logger.info("Loading cached dataset from pickle...")
with open(fn, "rb") as fp:
corpus = pickle.load(fp)
else:
logger.info('Producing dataset {}...'.format(dataset))
logger.info("Producing dataset {}...".format(dataset))
kwargs = {}
if dataset in ['wt103', 'wt2']:
kwargs['special'] = ['<eos>']
kwargs['lower_case'] = False
elif dataset == 'ptb':
kwargs['special'] = ['<eos>']
kwargs['lower_case'] = True
elif dataset == 'lm1b':
kwargs['special'] = []
kwargs['lower_case'] = False
kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
elif dataset in ['enwik8', 'text8']:
if dataset in ["wt103", "wt2"]:
kwargs["special"] = ["<eos>"]
kwargs["lower_case"] = False
elif dataset == "ptb":
kwargs["special"] = ["<eos>"]
kwargs["lower_case"] = True
elif dataset == "lm1b":
kwargs["special"] = []
kwargs["lower_case"] = False
kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt")
elif dataset in ["enwik8", "text8"]:
pass
corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
......
......@@ -13,19 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import os
import json
import six
import copy
import itertools
import json
import logging
import os
import re
from io import open
from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available
import six
from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
if is_tf_available():
import tensorflow as tf
......@@ -34,9 +35,10 @@ if is_torch_available():
logger = logging.getLogger(__name__)
SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
ADDED_TOKENS_FILE = 'added_tokens.json'
TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
class PreTrainedTokenizer(object):
""" Base class for all tokenizers.
......@@ -69,14 +71,22 @@ class PreTrainedTokenizer(object):
- ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensures they won't be split by the tokenization process. Will be associated with ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
"""
vocab_files_names = {}
pretrained_vocab_files_map = {}
pretrained_init_configuration = {}
max_model_input_sizes = {}
SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
"pad_token", "cls_token", "mask_token",
"additional_special_tokens"]
SPECIAL_TOKENS_ATTRIBUTES = [
"bos_token",
"eos_token",
"unk_token",
"sep_token",
"pad_token",
"cls_token",
"mask_token",
"additional_special_tokens",
]
padding_side = "right"
......@@ -227,7 +237,7 @@ class PreTrainedTokenizer(object):
self.max_len = max_len if max_len is not None else int(1e12)
# Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
self.padding_side = kwargs.pop('padding_side', self.padding_side)
self.padding_side = kwargs.pop("padding_side", self.padding_side)
# Added tokens
self.added_tokens_encoder = {}
......@@ -240,13 +250,14 @@ class PreTrainedTokenizer(object):
for key, value in kwargs.items():
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == 'additional_special_tokens':
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)) and all(
isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821
)
else:
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821
setattr(self, key, value)
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
r"""
......@@ -302,13 +313,12 @@ class PreTrainedTokenizer(object):
"""
return cls._from_pretrained(*inputs, **kwargs)
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False)
resume_download = kwargs.pop('resume_download', False)
proxies = kwargs.pop('proxies', None)
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
s3_models = list(cls.max_model_input_sizes.keys())
vocab_files = {}
......@@ -317,15 +327,19 @@ class PreTrainedTokenizer(object):
# Get the vocabulary from AWS S3 bucket
for file_id, map_list in cls.pretrained_vocab_files_map.items():
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration:
if (
cls.pretrained_init_configuration
and pretrained_model_name_or_path in cls.pretrained_init_configuration
):
init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path]
else:
# Get the vocabulary from local files
logger.info(
"Model name '{}' not found in model shortcut name list ({}). "
"Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
pretrained_model_name_or_path, ', '.join(s3_models),
pretrained_model_name_or_path))
pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
)
)
# Look for the tokenizer main vocabulary files
for file_id, file_name in cls.vocab_files_names.items():
......@@ -344,9 +358,10 @@ class PreTrainedTokenizer(object):
vocab_files[file_id] = full_file_name
# Look for the additional tokens files
additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE,
'tokenizer_config_file': TOKENIZER_CONFIG_FILE,
additional_files_names = {
"added_tokens_file": ADDED_TOKENS_FILE,
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
}
# If a path to a file was provided, get the parent directory
......@@ -366,9 +381,12 @@ class PreTrainedTokenizer(object):
"Model name '{}' was not found in tokenizers model name list ({}). "
"We assumed '{}' was a path or url to a directory containing vocabulary files "
"named {} but couldn't find such vocabulary files at this path or url.".format(
pretrained_model_name_or_path, ', '.join(s3_models),
pretrained_model_name_or_path,
list(cls.vocab_files_names.values())))
", ".join(s3_models),
pretrained_model_name_or_path,
list(cls.vocab_files_names.values()),
)
)
# Get files from url, cache, or disk depending on the case
try:
......@@ -377,17 +395,27 @@ class PreTrainedTokenizer(object):
if file_path is None:
resolved_vocab_files[file_id] = None
else:
resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download)
resolved_vocab_files[file_id] = cached_path(
file_path,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
)
except EnvironmentError:
if pretrained_model_name_or_path in s3_models:
msg = "Couldn't reach server at '{}' to download vocabulary files."
else:
msg = "Model name '{}' was not found in tokenizers model name list ({}). " \
"We assumed '{}' was a path or url to a directory containing vocabulary files " \
msg = (
"Model name '{}' was not found in tokenizers model name list ({}). "
"We assumed '{}' was a path or url to a directory containing vocabulary files "
"named {}, but couldn't find such vocabulary files at this path or url.".format(
pretrained_model_name_or_path, ', '.join(s3_models),
pretrained_model_name_or_path,
list(cls.vocab_files_names.values()))
", ".join(s3_models),
pretrained_model_name_or_path,
list(cls.vocab_files_names.values()),
)
)
raise EnvironmentError(msg)
......@@ -395,16 +423,15 @@ class PreTrainedTokenizer(object):
if file_path == resolved_vocab_files[file_id]:
logger.info("loading file {}".format(file_path))
else:
logger.info("loading file {} from cache at {}".format(
file_path, resolved_vocab_files[file_id]))
logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))
# Prepare tokenizer initialization kwargs
# Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
if tokenizer_config_file is not None:
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
init_kwargs = json.load(tokenizer_config_handle)
saved_init_inputs = init_kwargs.pop('init_inputs', ())
saved_init_inputs = init_kwargs.pop("init_inputs", ())
if not init_inputs:
init_inputs = saved_init_inputs
else:
......@@ -419,11 +446,11 @@ class PreTrainedTokenizer(object):
# won't index sequences longer than the number of positional embeddings
max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
if max_len is not None and isinstance(max_len, (int, float)):
init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len)
init_kwargs["max_len"] = min(init_kwargs.get("max_len", int(1e12)), max_len)
# Merge resolved_vocab_files arguments in init_kwargs.
added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
for args_name, file_path in resolved_vocab_files.items():
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
......@@ -438,8 +465,10 @@ class PreTrainedTokenizer(object):
try:
tokenizer = cls(*init_inputs, **init_kwargs)
except OSError:
OSError("Unable to load vocabulary from file. "
"Please check that the provided vocabulary is accessible and not corrupted.")
OSError(
"Unable to load vocabulary from file. "
"Please check that the provided vocabulary is accessible and not corrupted."
)
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs
......@@ -449,13 +478,12 @@ class PreTrainedTokenizer(object):
if added_tokens_file is not None:
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
added_tok_encoder = json.load(added_tokens_handle)
added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
tokenizer.added_tokens_encoder.update(added_tok_encoder)
tokenizer.added_tokens_decoder.update(added_tok_decoder)
return tokenizer
def save_pretrained(self, save_directory):
""" Save the tokenizer vocabulary files together with:
- added tokens,
......@@ -476,28 +504,27 @@ class PreTrainedTokenizer(object):
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
tokenizer_config = copy.deepcopy(self.init_kwargs)
tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs)
tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
with open(tokenizer_config_file, 'w', encoding='utf-8') as f:
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
with open(added_tokens_file, 'w', encoding='utf-8') as f:
with open(added_tokens_file, "w", encoding="utf-8") as f:
if self.added_tokens_encoder:
out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False)
else:
out_str = u"{}"
out_str = "{}"
f.write(out_str)
vocab_files = self.save_vocabulary(save_directory)
return vocab_files + (special_tokens_map_file, added_tokens_file)
def save_vocabulary(self, save_directory):
""" Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
and special token mappings.
......@@ -506,17 +533,14 @@ class PreTrainedTokenizer(object):
"""
raise NotImplementedError
def vocab_size(self):
""" Size of the base vocabulary (without the added tokens) """
raise NotImplementedError
def __len__(self):
""" Size of the full vocabulary with the added tokens """
return self.vocab_size + len(self.added_tokens_encoder)
def add_tokens(self, new_tokens):
"""
Add a list of new tokens to the tokenizer class. If the new tokens are not in the
......@@ -543,17 +567,19 @@ class PreTrainedTokenizer(object):
to_add_tokens = []
for token in new_tokens:
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
if self.init_kwargs.get('do_lower_case', False) and token not in self.all_special_tokens:
assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) # noqa: F821
if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens:
token = token.lower()
if token != self.unk_token and \
self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
token not in to_add_tokens:
if (
token != self.unk_token
and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
and token not in to_add_tokens
):
to_add_tokens.append(token)
logger.info("Adding %s to the vocabulary", token)
added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens))
added_tok_decoder = {v:k for k, v in added_tok_encoder.items()}
added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
self.added_tokens_encoder.update(added_tok_encoder)
self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens))
self.added_tokens_decoder.update(added_tok_decoder)
......@@ -622,18 +648,19 @@ class PreTrainedTokenizer(object):
added_tokens = 0
for key, value in special_tokens_dict.items():
assert key in self.SPECIAL_TOKENS_ATTRIBUTES
if key == 'additional_special_tokens':
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
if key == "additional_special_tokens":
assert isinstance(value, (list, tuple)) and all(
isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value # noqa: F821
)
added_tokens += self.add_tokens(value)
else:
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) # noqa: F821
added_tokens += self.add_tokens([value])
logger.info("Assigning %s to the %s key of the tokenizer", value, key)
setattr(self, key, value)
return added_tokens
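A hedged usage sketch of the two methods above; it assumes the transformers package is installed with network access to fetch the pretrained vocabulary, and the token strings are invented:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.add_tokens(["new_tok_1", "new_tok_2"]))                        # 2 if both are new
print(tokenizer.add_special_tokens({"additional_special_tokens": ["<ctx>"]}))  # 1 if '<ctx>' is new
print(len(tokenizer))                                                          # base vocab size + added tokens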
def tokenize(self, text, **kwargs):
""" Converts a string in a sequence of tokens (string), using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based
......@@ -649,14 +676,10 @@ class PreTrainedTokenizer(object):
def lowercase_text(t):
# convert non-special tokens to lowercase
escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens]
pattern = r'(' + r'|'.join(escaped_special_toks) + r')|' + \
r'(.+?)'
return re.sub(
pattern,
lambda m: m.groups()[0] or m.groups()[1].lower(),
t)
if self.init_kwargs.get('do_lower_case', False):
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t)
if self.init_kwargs.get("do_lower_case", False):
text = lowercase_text(text)
def split_on_token(tok, text):
......@@ -694,9 +717,14 @@ class PreTrainedTokenizer(object):
tokenized_text += [sub_text]
text_list = tokenized_text
return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) \
if token not in self.unique_added_tokens_encoder
else [token] for token in tokenized_text)))
return list(
itertools.chain.from_iterable(
(
self._tokenize(token, **kwargs) if token not in self.unique_added_tokens_encoder else [token]
for token in tokenized_text
)
)
)
added_tokens = self.unique_added_tokens_encoder
tokenized_text = split_on_tokens(added_tokens, text)
......@@ -718,7 +746,7 @@ class PreTrainedTokenizer(object):
if tokens is None:
return None
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): # noqa: F821
return self._convert_token_to_id_with_added_voc(tokens)
ids = []
......@@ -737,16 +765,18 @@ class PreTrainedTokenizer(object):
def _convert_token_to_id(self, token):
raise NotImplementedError
def encode(self,
def encode(
self,
text,
text_pair=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy='longest_first',
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
**kwargs):
**kwargs
):
"""
Converts a string into a sequence of ids (integer), using the tokenizer and vocabulary.
......@@ -781,7 +811,8 @@ class PreTrainedTokenizer(object):
or PyTorch torch.Tensor instead of a list of python integers.
**kwargs: passed to the `self.tokenize()` method
"""
encoded_inputs = self.encode_plus(text,
encoded_inputs = self.encode_plus(
text,
text_pair=text_pair,
max_length=max_length,
add_special_tokens=add_special_tokens,
......@@ -789,24 +820,27 @@ class PreTrainedTokenizer(object):
truncation_strategy=truncation_strategy,
pad_to_max_length=pad_to_max_length,
return_tensors=return_tensors,
**kwargs)
**kwargs
)
return encoded_inputs["input_ids"]
def encode_plus(self,
def encode_plus(
self,
text,
text_pair=None,
add_special_tokens=True,
max_length=None,
stride=0,
truncation_strategy='longest_first',
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False,
**kwargs):
**kwargs
):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional informations:
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
......@@ -874,12 +908,15 @@ class PreTrainedTokenizer(object):
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
else:
raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
raise ValueError(
"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
)
first_ids = get_input_ids(text)
second_ids = get_input_ids(text_pair) if text_pair is not None else None
return self.prepare_for_model(first_ids,
return self.prepare_for_model(
first_ids,
pair_ids=second_ids,
max_length=max_length,
pad_to_max_length=pad_to_max_length,
......@@ -890,18 +927,21 @@ class PreTrainedTokenizer(object):
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask)
return_special_tokens_mask=return_special_tokens_mask,
)
def batch_encode_plus(self,
def batch_encode_plus(
self,
batch_text_or_text_pairs=None,
add_special_tokens=False,
max_length=None,
stride=0,
truncation_strategy='longest_first',
truncation_strategy="longest_first",
return_tensors=None,
return_input_lengths=False,
return_attention_masks=False,
**kwargs):
**kwargs
):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional information:
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
......@@ -933,12 +973,19 @@ class PreTrainedTokenizer(object):
ids, pair_ids = ids_or_pair_ids
else:
ids, pair_ids = ids_or_pair_ids, None
outputs = self.encode_plus(ids, pair_ids, add_special_tokens=add_special_tokens, max_length=max_length,
stride=stride, truncation_strategy=truncation_strategy, return_tensors=None)
outputs = self.encode_plus(
ids,
pair_ids,
add_special_tokens=add_special_tokens,
max_length=max_length,
stride=stride,
truncation_strategy=truncation_strategy,
return_tensors=None,
)
# Append the non-padded length to the output
if return_input_lengths:
outputs['input_len'] = len(outputs['input_ids'])
outputs["input_len"] = len(outputs["input_ids"])
for key, value in outputs.items():
if key not in batch_outputs:
......@@ -946,11 +993,11 @@ class PreTrainedTokenizer(object):
batch_outputs[key].append(value)
# Compute longest sequence size
max_seq_len = max(map(len, batch_outputs['input_ids']))
max_seq_len = max(map(len, batch_outputs["input_ids"]))
if return_attention_masks:
# Allow the model to not give any special attention to padded input
batch_outputs['attention_mask'] = [[0] * len(v) for v in batch_outputs['input_ids']]
batch_outputs["attention_mask"] = [[0] * len(v) for v in batch_outputs["input_ids"]]
if return_tensors is not None:
......@@ -958,34 +1005,48 @@ class PreTrainedTokenizer(object):
for key, value in batch_outputs.items():
padded_value = value
if key != 'input_len':
if key != "input_len":
# Padding handle
padded_value = [v + [self.pad_token_id if key == 'input_ids' else 1] * (max_seq_len - len(v)) for v in padded_value]
padded_value = [
v + [self.pad_token_id if key == "input_ids" else 1] * (max_seq_len - len(v))
for v in padded_value
]
if return_tensors == 'tf' and is_tf_available():
if return_tensors == "tf" and is_tf_available():
batch_outputs[key] = tf.constant(padded_value)
elif return_tensors == 'pt' and is_torch_available():
elif return_tensors == "pt" and is_torch_available():
batch_outputs[key] = torch.tensor(padded_value)
elif return_tensors is not None:
logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
logger.warning(
"Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
return_tensors
)
)
# encoder_attention_mask requires 1 for real token, 0 for padding, just invert value
if return_attention_masks:
if is_tf_available():
batch_outputs['attention_mask'] = tf.abs(batch_outputs['attention_mask'] - 1)
batch_outputs["attention_mask"] = tf.abs(batch_outputs["attention_mask"] - 1)
else:
batch_outputs['attention_mask'] = torch.abs(batch_outputs['attention_mask'] - 1)
batch_outputs["attention_mask"] = torch.abs(batch_outputs["attention_mask"] - 1)
return batch_outputs
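A hedged sketch of batch_encode_plus with padding and masks (transformers and torch assumed available; the sentences are invented):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer.batch_encode_plus(
    ["a short sentence", "a slightly longer example sentence"],
    add_special_tokens=True,
    return_attention_masks=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape)    # (2, longest_sequence) after padding
print(batch["attention_mask"][0])  # 1 for real tokens, 0 for the padded tail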
def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
truncation_strategy='longest_first',
def prepare_for_model(
self,
ids,
pair_ids=None,
max_length=None,
add_special_tokens=True,
stride=0,
truncation_strategy="longest_first",
pad_to_max_length=False,
return_tensors=None,
return_token_type_ids=True,
return_attention_mask=True,
return_overflowing_tokens=False,
return_special_tokens_mask=False):
return_special_tokens_mask=False,
):
"""
Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
It adds special tokens, truncates
......@@ -1050,10 +1111,13 @@ class PreTrainedTokenizer(object):
# Handle max sequence length
total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0)
if max_length and total_len > max_length:
ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids,
num_tokens_to_remove=total_len-max_length,
ids, pair_ids, overflowing_tokens = self.truncate_sequences(
ids,
pair_ids=pair_ids,
num_tokens_to_remove=total_len - max_length,
truncation_strategy=truncation_strategy,
stride=stride)
stride=stride,
)
if return_overflowing_tokens:
encoded_inputs["overflowing_tokens"] = overflowing_tokens
encoded_inputs["num_truncated_tokens"] = total_len - max_length
......@@ -1081,35 +1145,45 @@ class PreTrainedTokenizer(object):
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length]
if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len:
logger.warning("Token indices sequence length is longer than the specified maximum sequence length "
logger.warning(
"Token indices sequence length is longer than the specified maximum sequence length "
"for this model ({} > {}). Running this sequence through the model will result in "
"indexing errors".format(len(ids), self.max_len))
"indexing errors".format(len(ids), self.max_len)
)
needs_to_be_padded = pad_to_max_length and (
max_length and len(encoded_inputs["input_ids"]) < max_length
or
max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000
max_length
and len(encoded_inputs["input_ids"]) < max_length
or max_length is None
and len(encoded_inputs["input_ids"]) < self.max_len
and self.max_len <= 10000
)
if pad_to_max_length and max_length is None and self.max_len > 10000:
logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
logger.warning(
"Sequence can't be padded as no maximum length is specified and the model maximum length is too high."
)
if needs_to_be_padded:
difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
if self.padding_side == 'right':
if self.padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference
if return_token_type_ids:
encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
)
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
elif self.padding_side == 'left':
elif self.padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
if return_token_type_ids:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
"token_type_ids"
]
if return_special_tokens_mask:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"]
......@@ -1121,14 +1195,14 @@ class PreTrainedTokenizer(object):
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
# Prepare inputs as tensors if asked
if return_tensors == 'tf' and is_tf_available():
if return_tensors == "tf" and is_tf_available():
encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
elif return_tensors == 'pt' and is_torch_available():
elif return_tensors == "pt" and is_torch_available():
encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
......@@ -1137,11 +1211,15 @@ class PreTrainedTokenizer(object):
elif return_tensors is not None:
logger.warning(
"Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
return_tensors))
return_tensors
)
)
return encoded_inputs
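The right/left padding branches above reduce to list concatenation; a standalone toy with invented ids and a pad id of 0:

input_ids, max_length, pad_id = [101, 7592, 102], 6, 0
difference = max_length - len(input_ids)

# padding_side == "right"
right = {
    "input_ids": input_ids + [pad_id] * difference,
    "attention_mask": [1] * len(input_ids) + [0] * difference,
}
# padding_side == "left"
left = {
    "input_ids": [pad_id] * difference + input_ids,
    "attention_mask": [0] * difference + [1] * len(input_ids),
}
print(right)  # {'input_ids': [101, 7592, 102, 0, 0, 0], 'attention_mask': [1, 1, 1, 0, 0, 0]}
print(left)   # {'input_ids': [0, 0, 0, 101, 7592, 102], 'attention_mask': [0, 0, 0, 1, 1, 1]}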
def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
def truncate_sequences(
self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0
):
"""Truncates a sequence pair in place to the maximum length.
truncation_strategy: string selected in the following options:
- 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length
......@@ -1154,7 +1232,7 @@ class PreTrainedTokenizer(object):
if num_tokens_to_remove <= 0:
return ids, pair_ids, []
if truncation_strategy == 'longest_first':
if truncation_strategy == "longest_first":
overflowing_tokens = []
for _ in range(num_tokens_to_remove):
if pair_ids is None or len(ids) > len(pair_ids):
......@@ -1165,20 +1243,22 @@ class PreTrainedTokenizer(object):
window_len = min(len(ids), stride)
if window_len > 0:
overflowing_tokens = ids[-window_len:] + overflowing_tokens
elif truncation_strategy == 'only_first':
elif truncation_strategy == "only_first":
assert len(ids) > num_tokens_to_remove
window_len = min(len(ids), stride + num_tokens_to_remove)
overflowing_tokens = ids[-window_len:]
ids = ids[:-num_tokens_to_remove]
elif truncation_strategy == 'only_second':
elif truncation_strategy == "only_second":
assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
window_len = min(len(pair_ids), stride + num_tokens_to_remove)
overflowing_tokens = pair_ids[-window_len:]
pair_ids = pair_ids[:-num_tokens_to_remove]
elif truncation_strategy == 'do_not_truncate':
elif truncation_strategy == "do_not_truncate":
raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.")
else:
raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
raise ValueError(
"Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']"
)
return (ids, pair_ids, overflowing_tokens)
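A standalone toy of the 'longest_first' strategy above, trimming whichever sequence is currently longer (ids invented, stride ignored):

ids, pair_ids, num_tokens_to_remove = [1, 2, 3, 4, 5], [6, 7, 8], 3

for _ in range(num_tokens_to_remove):
    if pair_ids is None or len(ids) > len(pair_ids):
        ids = ids[:-1]            # drop from the longer sequence
    else:
        pair_ids = pair_ids[:-1]

print(ids, pair_ids)  # [1, 2, 3] [6, 7]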
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
......@@ -1246,7 +1326,7 @@ class PreTrainedTokenizer(object):
The simplest way to do it is ' '.join(self.convert_ids_to_tokens(token_ids))
but we often want to remove sub-word tokenization artifacts at the same time.
"""
return ' '.join(self.convert_ids_to_tokens(tokens))
return " ".join(self.convert_ids_to_tokens(tokens))
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
"""
......@@ -1278,7 +1358,7 @@ class PreTrainedTokenizer(object):
current_sub_text.append(token)
if current_sub_text:
sub_texts.append(self.convert_tokens_to_string(current_sub_text))
text = ' '.join(sub_texts)
text = " ".join(sub_texts)
if clean_up_tokenization_spaces:
clean_text = self.clean_up_tokenization(text)
......@@ -1323,7 +1403,17 @@ class PreTrainedTokenizer(object):
def clean_up_tokenization(out_string):
""" Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
"""
out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
out_string = (
out_string.replace(" .", ".")
.replace(" ?", "?")
.replace(" !", "!")
.replace(" ,", ",")
.replace(" ' ", "'")
.replace(" n't", "n't")
.replace(" 'm", "'m")
.replace(" do not", " don't")
.replace(" 's", "'s")
.replace(" 've", "'ve")
.replace(" 're", "'re")
)
return out_string
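A cut-down, self-contained illustration of the clean-up using only a subset of the replacements above (the sentence is made up):

out_string = "do n't worry , it 's fine ."
cleaned = out_string.replace(" n't", "n't").replace(" ,", ",").replace(" 's", "'s").replace(" .", ".")
print(cleaned)  # "don't worry, it's fine."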
......@@ -13,8 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for XLM."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
......@@ -27,76 +26,76 @@ from io import open
import sacremoses as sm
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_bert import BasicTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {
'vocab_file': 'vocab.json',
'merges_file': 'merges.txt',
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
PRETRAINED_VOCAB_FILES_MAP = {
'vocab_file':
{
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json",
'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json",
'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json",
'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json",
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json",
"vocab_file": {
"xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json",
"xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json",
"xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json",
"xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json",
"xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json",
"xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json",
"xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json",
"xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json",
"xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json",
"xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json",
},
'merges_file':
{
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt",
'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt",
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt",
"merges_file": {
"xlm-mlm-en-2048": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt",
"xlm-mlm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
"xlm-mlm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
"xlm-mlm-enro-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt",
"xlm-mlm-tlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt",
"xlm-mlm-xnli15-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt",
"xlm-clm-enfr-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt",
"xlm-clm-ende-1024": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt",
"xlm-mlm-17-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt",
"xlm-mlm-100-1280": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt",
},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'xlm-mlm-en-2048': 512,
'xlm-mlm-ende-1024': 512,
'xlm-mlm-enfr-1024': 512,
'xlm-mlm-enro-1024': 512,
'xlm-mlm-tlm-xnli15-1024': 512,
'xlm-mlm-xnli15-1024': 512,
'xlm-clm-enfr-1024': 512,
'xlm-clm-ende-1024': 512,
'xlm-mlm-17-1280': 512,
'xlm-mlm-100-1280': 512,
"xlm-mlm-en-2048": 512,
"xlm-mlm-ende-1024": 512,
"xlm-mlm-enfr-1024": 512,
"xlm-mlm-enro-1024": 512,
"xlm-mlm-tlm-xnli15-1024": 512,
"xlm-mlm-xnli15-1024": 512,
"xlm-clm-enfr-1024": 512,
"xlm-clm-ende-1024": 512,
"xlm-mlm-17-1280": 512,
"xlm-mlm-100-1280": 512,
}
PRETRAINED_INIT_CONFIGURATION = {
'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True},
'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "de",
"1": "en"},
"lang2id": { "de": 0,
"en": 1 }},
'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "en",
"1": "fr"},
"lang2id": { "en": 0,
"fr": 1 }},
'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "en",
"1": "ro"},
"lang2id": { "en": 0,
"ro": 1 }},
'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "ar",
"xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True},
"xlm-mlm-ende-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {"0": "de", "1": "en"},
"lang2id": {"de": 0, "en": 1},
},
"xlm-mlm-enfr-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {"0": "en", "1": "fr"},
"lang2id": {"en": 0, "fr": 1},
},
"xlm-mlm-enro-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {"0": "en", "1": "ro"},
"lang2id": {"en": 0, "ro": 1},
},
"xlm-mlm-tlm-xnli15-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {
"0": "ar",
"1": "bg",
"2": "de",
"3": "el",
......@@ -110,8 +109,10 @@ PRETRAINED_INIT_CONFIGURATION = {
"11": "tr",
"12": "ur",
"13": "vi",
"14": "zh"},
"lang2id": { "ar": 0,
"14": "zh",
},
"lang2id": {
"ar": 0,
"bg": 1,
"de": 2,
"el": 3,
......@@ -125,9 +126,13 @@ PRETRAINED_INIT_CONFIGURATION = {
"tr": 11,
"ur": 12,
"vi": 13,
"zh": 14 }},
'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "ar",
"zh": 14,
},
},
"xlm-mlm-xnli15-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {
"0": "ar",
"1": "bg",
"2": "de",
"3": "el",
......@@ -141,8 +146,10 @@ PRETRAINED_INIT_CONFIGURATION = {
"11": "tr",
"12": "ur",
"13": "vi",
"14": "zh"},
"lang2id": { "ar": 0,
"14": "zh",
},
"lang2id": {
"ar": 0,
"bg": 1,
"de": 2,
"el": 3,
......@@ -156,18 +163,21 @@ PRETRAINED_INIT_CONFIGURATION = {
"tr": 11,
"ur": 12,
"vi": 13,
"zh": 14 }},
'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "en",
"1": "fr"},
"lang2id": { "en": 0,
"fr": 1 }},
'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True,
"id2lang": { "0": "de",
"1": "en"},
"lang2id": { "de": 0,
"en": 1 }},
'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False,
"zh": 14,
},
},
"xlm-clm-enfr-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {"0": "en", "1": "fr"},
"lang2id": {"en": 0, "fr": 1},
},
"xlm-clm-ende-1024": {
"do_lowercase_and_remove_accent": True,
"id2lang": {"0": "de", "1": "en"},
"lang2id": {"de": 0, "en": 1},
},
"xlm-mlm-17-1280": {
"do_lowercase_and_remove_accent": False,
"id2lang": {
"0": "ar",
"1": "de",
......@@ -185,7 +195,7 @@ PRETRAINED_INIT_CONFIGURATION = {
"13": "sv",
"14": "tr",
"15": "vi",
"16": "zh"
"16": "zh",
},
"lang2id": {
"ar": 0,
......@@ -204,8 +214,11 @@ PRETRAINED_INIT_CONFIGURATION = {
"sv": 13,
"tr": 14,
"vi": 15,
"zh": 16}},
'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False,
"zh": 16,
},
},
"xlm-mlm-100-1280": {
"do_lowercase_and_remove_accent": False,
"id2lang": {
"0": "af",
"1": "als",
......@@ -306,7 +319,7 @@ PRETRAINED_INIT_CONFIGURATION = {
"96": "zh",
"97": "zh_classical",
"98": "zh_min_nan",
"99": "zh_yue"
"99": "zh_yue",
},
"lang2id": {
"af": 0,
......@@ -408,10 +421,12 @@ PRETRAINED_INIT_CONFIGURATION = {
"zh": 96,
"zh_classical": 97,
"zh_min_nan": 98,
"zh_yue": 99
}},
"zh_yue": 99,
},
},
}
def get_pairs(word):
"""
Return set of symbol pairs in a word.
......@@ -430,7 +445,7 @@ def lowercase_and_remove_accent(text):
Lowercase and strips accents from a piece of text based on
https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py
"""
text = ' '.join(text)
text = " ".join(text)
text = text.lower()
text = unicodedata.normalize("NFD", text)
output = []
......@@ -439,73 +454,73 @@ def lowercase_and_remove_accent(text):
if cat == "Mn":
continue
output.append(char)
return "".join(output).lower().split(' ')
return "".join(output).lower().split(" ")
def replace_unicode_punct(text):
'''
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
'''
text = text.replace(',', ',')
text = re.sub(r'。\s*', '. ', text)
text = text.replace('、', ',')
text = text.replace('”', '"')
text = text.replace('“', '"')
text = text.replace('∶', ':')
text = text.replace(':', ':')
text = text.replace('?', '?')
text = text.replace('《', '"')
text = text.replace('》', '"')
text = text.replace(')', ')')
text = text.replace('!', '!')
text = text.replace('(', '(')
text = text.replace(';', ';')
text = text.replace('1', '"')
text = text.replace('」', '"')
text = text.replace('「', '"')
text = text.replace('0', '0')
text = text.replace('3', '3')
text = text.replace('2', '2')
text = text.replace('5', '5')
text = text.replace('6', '6')
text = text.replace('9', '9')
text = text.replace('7', '7')
text = text.replace('8', '8')
text = text.replace('4', '4')
text = re.sub(r'.\s*', '. ', text)
text = text.replace('~', '~')
text = text.replace('’', '\'')
text = text.replace('…', '...')
text = text.replace('━', '-')
text = text.replace('〈', '<')
text = text.replace('〉', '>')
text = text.replace('【', '[')
text = text.replace('】', ']')
text = text.replace('%', '%')
"""
text = text.replace(",", ",")
text = re.sub(r"。\s*", ". ", text)
text = text.replace("、", ",")
text = text.replace("”", '"')
text = text.replace("“", '"')
text = text.replace("∶", ":")
text = text.replace(":", ":")
text = text.replace("?", "?")
text = text.replace("《", '"')
text = text.replace("》", '"')
text = text.replace(")", ")")
text = text.replace("!", "!")
text = text.replace("(", "(")
text = text.replace(";", ";")
text = text.replace("1", '"')
text = text.replace("」", '"')
text = text.replace("「", '"')
text = text.replace("0", "0")
text = text.replace("3", "3")
text = text.replace("2", "2")
text = text.replace("5", "5")
text = text.replace("6", "6")
text = text.replace("9", "9")
text = text.replace("7", "7")
text = text.replace("8", "8")
text = text.replace("4", "4")
text = re.sub(r".\s*", ". ", text)
text = text.replace("~", "~")
text = text.replace("’", "'")
text = text.replace("…", "...")
text = text.replace("━", "-")
text = text.replace("〈", "<")
text = text.replace("〉", ">")
text = text.replace("【", "[")
text = text.replace("】", "]")
text = text.replace("%", "%")
return text
def remove_non_printing_char(text):
'''
"""
Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
'''
"""
output = []
for char in text:
cat = unicodedata.category(char)
if cat.startswith('C'):
if cat.startswith("C"):
continue
output.append(char)
return "".join(output)
def romanian_preprocessing(text):
'''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`'''
"""Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`"""
# https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py
text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219")
text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b")
# https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py
text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma
text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma
text = text.replace("\u0218", "S").replace("\u0219", "s") # s-comma
text = text.replace("\u021a", "T").replace("\u021b", "t") # t-comma
text = text.replace("\u0102", "A").replace("\u0103", "a")
text = text.replace("\u00C2", "A").replace("\u00E2", "a")
text = text.replace("\u00CE", "I").replace("\u00EE", "i")
......@@ -531,24 +546,49 @@ class XLMTokenizer(PreTrainedTokenizer):
- `do_lowercase_and_remove_accent` controls lower casing and accent removal (automatically set for pretrained vocabularies)
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, merges_file, unk_token="<unk>", bos_token="<s>",
sep_token="</s>", pad_token="<pad>", cls_token="</s>",
mask_token="<special1>", additional_special_tokens=["<special0>",
"<special1>", "<special2>", "<special3>", "<special4>", "<special5>",
"<special6>", "<special7>", "<special8>", "<special9>"],
lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True,
**kwargs):
super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token,
sep_token=sep_token, pad_token=pad_token,
cls_token=cls_token, mask_token=mask_token,
def __init__(
self,
vocab_file,
merges_file,
unk_token="<unk>",
bos_token="<s>",
sep_token="</s>",
pad_token="<pad>",
cls_token="</s>",
mask_token="<special1>",
additional_special_tokens=[
"<special0>",
"<special1>",
"<special2>",
"<special3>",
"<special4>",
"<special5>",
"<special6>",
"<special7>",
"<special8>",
"<special9>",
],
lang2id=None,
id2lang=None,
do_lowercase_and_remove_accent=True,
**kwargs
):
super(XLMTokenizer, self).__init__(
unk_token=unk_token,
bos_token=bos_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
additional_special_tokens=additional_special_tokens,
**kwargs)
**kwargs
)
self.max_len_single_sentence = self.max_len - 2 # take into account special tokens
self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens
......@@ -557,7 +597,7 @@ class XLMTokenizer(PreTrainedTokenizer):
self.cache_moses_punct_normalizer = dict()
# cache of sm.MosesTokenizer instance
self.cache_moses_tokenizer = dict()
self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja'])
self.lang_with_custom_tokenizer = set(["zh", "th", "ja"])
# True for current supported model (v1.2.0), False for XLM-17 & 100
self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
self.lang2id = lang2id
......@@ -570,9 +610,9 @@ class XLMTokenizer(PreTrainedTokenizer):
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
self.decoder = {v:k for k,v in self.encoder.items()}
with open(merges_file, encoding='utf-8') as merges_handle:
merges = merges_handle.read().split('\n')[:-1]
self.decoder = {v: k for k, v in self.encoder.items()}
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
merges = [tuple(merge.split()[:2]) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
......@@ -603,9 +643,14 @@ class XLMTokenizer(PreTrainedTokenizer):
if self.ja_word_tokenizer is None:
try:
import Mykytea
self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
self.ja_word_tokenizer = Mykytea.Mykytea(
"-model %s/local/share/kytea/model.bin" % os.path.expanduser("~")
)
except (AttributeError, ImportError) as e:
logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
logger.error(
"Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps"
)
logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
logger.error("2. autoreconf -i")
logger.error("3. ./configure --prefix=$HOME/local")
......@@ -619,16 +664,16 @@ class XLMTokenizer(PreTrainedTokenizer):
return len(self.encoder)
def bpe(self, token):
word = tuple(token[:-1]) + (token[-1] + '</w>',)
word = tuple(token[:-1]) + (token[-1] + "</w>",)
if token in self.cache:
return self.cache[token]
pairs = get_pairs(word)
if not pairs:
return token+'</w>'
return token + "</w>"
while True:
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
if bigram not in self.bpe_ranks:
break
first, second = bigram
......@@ -637,14 +682,15 @@ class XLMTokenizer(PreTrainedTokenizer):
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word)-1 and word[i+1] == second:
new_word.append(first+second)
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
......@@ -655,13 +701,13 @@ class XLMTokenizer(PreTrainedTokenizer):
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
if word == '\n </w>':
word = '\n</w>'
word = " ".join(word)
if word == "\n </w>":
word = "\n</w>"
self.cache[token] = word
return word
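To make the greedy merge loop concrete, here is a stripped-down, self-contained sketch with a hand-made rank table; `toy_bpe`, `toy_ranks` and the local `get_pairs` exist only for this illustration (the real ranks are built from the merges file in `__init__` above):
```
def get_pairs(word):
    """Set of adjacent symbol pairs in a word represented as a tuple of symbols."""
    return {(word[i], word[i + 1]) for i in range(len(word) - 1)}

# Lower rank = merged earlier. Real tables come from the merges file, not hand-written.
toy_ranks = {("l", "o"): 0, ("lo", "w</w>"): 1}

def toy_bpe(token, bpe_ranks):
    word = tuple(token[:-1]) + (token[-1] + "</w>",)  # mark the word boundary
    pairs = get_pairs(word)
    while pairs:
        # Pick the lowest-ranked (earliest-learned) pair still present in the word.
        bigram = min(pairs, key=lambda pair: bpe_ranks.get(pair, float("inf")))
        if bigram not in bpe_ranks:
            break
        first, second = bigram
        new_word, i = [], 0
        while i < len(word):
            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1
        word = tuple(new_word)
        pairs = get_pairs(word)
    return " ".join(word)

print(toy_bpe("low", toy_ranks))  # -> "low</w>" ("l"+"o" merges first, then "lo"+"w</w>")
```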
def _tokenize(self, text, lang='en', bypass_tokenizer=False):
def _tokenize(self, text, lang="en", bypass_tokenizer=False):
"""
Tokenize a string given a language code. For Chinese, Japanese and Thai, we use a language-specific tokenizer. Otherwise, we use Moses.
......@@ -679,10 +725,10 @@ class XLMTokenizer(PreTrainedTokenizer):
make && make install
pip install kytea
```
- [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer *
- [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*)
- Install with `pip install jieba`
\* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip).
(*) The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip).
However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated.
Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine
if you fine-tune the model with Chinese supervision. If you want the exact same behaviour, use the original XLM
......@@ -697,45 +743,49 @@ class XLMTokenizer(PreTrainedTokenizer):
List of tokens.
"""
if lang and self.lang2id and lang not in self.lang2id:
logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.")
logger.error(
"Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model."
)
if bypass_tokenizer:
text = text.split()
elif lang not in self.lang_with_custom_tokenizer:
text = self.moses_pipeline(text, lang=lang)
# TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
if lang == 'ro':
if lang == "ro":
text = romanian_preprocessing(text)
text = self.moses_tokenize(text, lang=lang)
elif lang == 'th':
elif lang == "th":
text = self.moses_pipeline(text, lang=lang)
try:
if 'pythainlp' not in sys.modules:
if "pythainlp" not in sys.modules:
from pythainlp.tokenize import word_tokenize as th_word_tokenize
else:
th_word_tokenize = sys.modules['pythainlp'].word_tokenize
th_word_tokenize = sys.modules["pythainlp"].word_tokenize
except (AttributeError, ImportError) as e:
logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps")
logger.error(
"Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps"
)
logger.error("1. pip install pythainlp")
raise e
text = th_word_tokenize(text)
elif lang == 'zh':
elif lang == "zh":
try:
if 'jieba' not in sys.modules:
if "jieba" not in sys.modules:
import jieba
else:
jieba = sys.modules['jieba']
jieba = sys.modules["jieba"]
except (AttributeError, ImportError) as e:
logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
logger.error("1. pip install jieba")
raise e
text = ' '.join(jieba.cut(text))
text = " ".join(jieba.cut(text))
text = self.moses_pipeline(text, lang=lang)
text = text.split()
elif lang == 'ja':
elif lang == "ja":
text = self.moses_pipeline(text, lang=lang)
text = self.ja_tokenize(text)
else:
raise ValueError('It should not reach here')
raise ValueError("It should not reach here")
if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
text = lowercase_and_remove_accent(text)
......@@ -743,7 +793,7 @@ class XLMTokenizer(PreTrainedTokenizer):
split_tokens = []
for token in text:
if token:
split_tokens.extend([t for t in self.bpe(token).split(' ')])
split_tokens.extend([t for t in self.bpe(token).split(" ")])
return split_tokens
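A hedged usage sketch of the language-aware path: the checkpoint name comes from the docstrings above, the call goes through the internal `_tokenize` shown here, and the resulting subwords depend entirely on the pretrained vocabulary:
```
from transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-enro-1024")
# Romanian goes through the Moses pipeline + romanian_preprocessing + Moses tokenization + BPE.
tokens = tokenizer._tokenize("Bună ziua, lume!", lang="ro")
ids = tokenizer.convert_tokens_to_ids(tokens)
```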
......@@ -757,7 +807,7 @@ class XLMTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens):
""" Converts a sequence of tokens (string) in a single string. """
out_string = ''.join(tokens).replace('</w>', ' ').strip()
out_string = "".join(tokens).replace("</w>", " ").strip()
return out_string
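This follows directly from the implementation: `</w>` marks word boundaries, so joining the tokens and replacing the marker with a space restores the surface string:
```
# With any XLMTokenizer instance `tokenizer`:
print(tokenizer.convert_tokens_to_string(["hello</w>", "wor", "ld</w>"]))  # -> "hello world"
```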
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
......@@ -792,8 +842,10 @@ class XLMTokenizer(PreTrainedTokenizer):
if already_has_special_tokens:
if token_ids_1 is not None:
raise ValueError("You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model.")
raise ValueError(
"You should not supply a second sequence if the provided sequence of "
"ids is already formated with special tokens for the model."
)
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
if token_ids_1 is not None:
......@@ -820,20 +872,22 @@ class XLMTokenizer(PreTrainedTokenizer):
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])
merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES["merges_file"])
with open(vocab_file, 'w', encoding='utf-8') as f:
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, ensure_ascii=False))
index = 0
with open(merge_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file))
logger.warning(
"Saving vocabulary to {}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!".format(merge_file)
)
index = token_index
writer.write(' '.join(bpe_tokens) + u'\n')
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
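A short sketch of saving and reloading the vocabulary files returned here; the directory is a placeholder, and reloading through the constructor mirrors the `vocab_file, merges_file` positional arguments of `__init__` above:
```
import tempfile

save_dir = tempfile.mkdtemp()
vocab_path, merges_path = tokenizer.save_vocabulary(save_dir)  # JSON vocab + BPE merges
reloaded = XLMTokenizer(vocab_path, merges_path)               # rebuilds encoder/bpe_ranks from the saved files
```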