"examples/vscode:/vscode.git/clone" did not exist on "e67fea2c7e1f710d77a1a1b36da01b541c312c6c"
Unverified commit 0a2fecdf, authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into master

Parents: 39eb31e1 e0caab0c
@@ -20,13 +20,17 @@ import json

from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases

class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = OpenAIGPTTokenizer

    def setUp(self):
        super(OpenAIGPTTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",

@@ -34,30 +38,34 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):

        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

if __name__ == '__main__':
...
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import json
import unittest
from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases
class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = RobertaTokenizer

    def setUp(self):
        super(RobertaTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "lo", "low", "er",
                 "low", "lowest", "newer", "wider", "<unk>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return RobertaTokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower<unk>newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower"
        bpe_tokens = ["low", "er"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [13, 12, 17]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    def roberta_dict_integration_testing(self):
        tokenizer = self.get_tokenizer()
        self.assertListEqual(
            tokenizer.encode('Hello world!'),
            [0, 31414, 232, 328, 2]
        )
        self.assertListEqual(
            tokenizer.encode('Hello world! cécé herlolip 418'),
            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
        )

    def test_sequence_builders(self):
        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
        encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == encoded_text_from_decode
        assert encoded_pair == encoded_pair_from_decode

if __name__ == '__main__':
    unittest.main()
@@ -19,6 +19,7 @@ import sys

from io import open
import tempfile
import shutil
import unittest

if sys.version_info[0] == 2:
    import cPickle as pickle

@@ -36,113 +37,124 @@ else:

    unicode = str

class CommonTestCases:

    class CommonTokenizerTester(unittest.TestCase):

        tokenizer_class = None

        def setUp(self):
            self.tmpdirname = tempfile.mkdtemp()

        def tearDown(self):
            shutil.rmtree(self.tmpdirname)

        def get_tokenizer(self):
            raise NotImplementedError

        def get_input_output_texts(self):
            raise NotImplementedError

        def test_save_and_load_tokenizer(self):
            tokenizer = self.get_tokenizer()
            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")

            with TemporaryDirectory() as tmpdirname:
                tokenizer.save_pretrained(tmpdirname)
                tokenizer = tokenizer.from_pretrained(tmpdirname)

            after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
            self.assertListEqual(before_tokens, after_tokens)

        def test_pickle_tokenizer(self):
            tokenizer = self.get_tokenizer()
            self.assertIsNotNone(tokenizer)

            text = u"Munich and Berlin are nice cities"
            subwords = tokenizer.tokenize(text)

            with TemporaryDirectory() as tmpdirname:
                filename = os.path.join(tmpdirname, u"tokenizer.bin")
                pickle.dump(tokenizer, open(filename, "wb"))
                tokenizer_new = pickle.load(open(filename, "rb"))

            subwords_loaded = tokenizer_new.tokenize(text)

            self.assertListEqual(subwords, subwords_loaded)

        def test_add_tokens_tokenizer(self):
            tokenizer = self.get_tokenizer()

            vocab_size = tokenizer.vocab_size
            all_size = len(tokenizer)

            self.assertNotEqual(vocab_size, 0)
            self.assertEqual(vocab_size, all_size)

            new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
            added_toks = tokenizer.add_tokens(new_toks)
            vocab_size_2 = tokenizer.vocab_size
            all_size_2 = len(tokenizer)

            self.assertNotEqual(vocab_size_2, 0)
            self.assertEqual(vocab_size, vocab_size_2)
            self.assertEqual(added_toks, len(new_toks))
            self.assertEqual(all_size_2, all_size + len(new_toks))

            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
            self.assertGreaterEqual(len(tokens), 4)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

            new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
                          'pad_token': "<<<<<|||>|>>>>|>"}
            added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
            vocab_size_3 = tokenizer.vocab_size
            all_size_3 = len(tokenizer)

            self.assertNotEqual(vocab_size_3, 0)
            self.assertEqual(vocab_size, vocab_size_3)
            self.assertEqual(added_toks_2, len(new_toks_2))
            self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")

            self.assertGreaterEqual(len(tokens), 6)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[0], tokens[1])
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokens[-3])
            self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
            self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))

        def test_required_methods_tokenizer(self):
            tokenizer = self.get_tokenizer()
            input_text, output_text = self.get_input_output_texts()

            tokens = tokenizer.tokenize(input_text)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            ids_2 = tokenizer.encode(input_text)
            self.assertListEqual(ids, ids_2)

            tokens_2 = tokenizer.convert_ids_to_tokens(ids)
            text_2 = tokenizer.decode(ids)

            self.assertEqual(text_2, output_text)

            self.assertNotEqual(len(tokens_2), 0)
            self.assertIsInstance(text_2, (str, unicode))

        def test_pretrained_model_lists(self):
            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
            weights_lists_2 = []
            for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
                weights_lists_2.append(list(map_list.keys()))

            for weights_list_2 in weights_lists_2:
                self.assertListEqual(weights_list, weights_list_2)
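The pattern the refactor establishes: each concrete test case subclasses CommonTestCases.CommonTokenizerTester, writes its fixture files into self.tmpdirname in setUp, and implements the two factory methods; the five test_* methods above then run against it automatically. A minimal sketch (the class name and fixture vocabulary here are illustrative, mirroring the real test files in this diff, and are not part of the change itself):

import os

from pytorch_transformers.tokenization_bert import BertTokenizer
from .tokenization_tests_commons import CommonTestCases

class BertTokenizationSketch(CommonTestCases.CommonTokenizerTester):
    # Hypothetical subclass for illustration only.
    tokenizer_class = BertTokenizer

    def setUp(self):
        super(BertTokenizationSketch, self).setUp()
        # One wordpiece per line, as load_vocab() expects.
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "runn", "##ing", ","]
        self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
        with open(self.vocab_file, "w", encoding="utf-8") as f:
            f.write("".join(t + "\n" for t in vocab_tokens))

    def get_tokenizer(self):
        return BertTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        return u"UNwant\u00E9d,running", u"unwanted, running"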
@@ -20,32 +20,39 @@ from io import open

from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases

class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = TransfoXLTokenizer

    def setUp(self):
        super(TransfoXLTokenizationTest, self).setUp()

        vocab_tokens = [
            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
            "running", ",", "low", "l",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self):
        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)

    def get_input_output_texts(self):
        input_text = u"<unk> UNwanted , running"
        output_text = u"<unk> unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)

        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])

    def test_full_tokenizer_lower(self):
        tokenizer = TransfoXLTokenizer(lower_case=True)
...
@@ -20,12 +20,16 @@ import json

from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases

class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = XLMTokenizer

    def setUp(self):
        super(XLMTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",

@@ -33,31 +37,46 @@ class XLMTokenizationTest(unittest.TestCase):

        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return XLMTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    def test_sequence_builders(self):
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == [1] + text + [1]
        assert encoded_pair == [1] + text + [1] + text_2 + [1]

if __name__ == '__main__':
    unittest.main()
@@ -19,48 +19,58 @@ import unittest

from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
from .tokenization_tests_commons import CommonTestCases

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'fixtures/test_sentencepiece.model')

class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = XLNetTokenizer

    def setUp(self):
        super(XLNetTokenizationTest, self).setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)

    def get_tokenizer(self):
        return XLNetTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"This is a test"
        output_text = u"This is a test"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids, [8, 21, 84, 55, 24, 19, 7, 0,
                  602, 347, 347, 347, 3, 12, 66,
                  46, 72, 80, 6, 0, 4])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
                                           u'or', u'n', SPIECE_UNDERLINE + u'in',
                                           SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
                                           SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                           SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                                           u'<unk>', u'.'])

    def test_tokenizer_lower(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)

@@ -79,6 +89,18 @@ class XLNetTokenizationTest(unittest.TestCase):

                              u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                              SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])

    def test_sequence_builders(self):
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == text + [4, 3]
        assert encoded_pair == text + [4] + text_2 + [4, 3]

if __name__ == '__main__':
    unittest.main()
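Since the new test_sequence_builders methods assert the layouts only as raw ids, it may help to spell out what each model family's builder produces. A summary of the assertions above, with the special-token ids they rely on:

# XLM   (xlm-mlm-en-2048):  [1] + A + [1]   and   [1] + A + [1] + B + [1]
#                           where 1 is the boundary token id asserted in the XLM test.
# XLNet (xlnet-base-cased): A + [4, 3]      and   A + [4] + B + [4, 3]
#                           where 4 = <sep> and 3 = <cls>; XLNet appends its
#                           special tokens at the end instead of prepending them.
# RoBERTa (roberta-base):   checked by round-tripping through
#                           encode(..., add_special_tokens=True), as asserted
#                           in the RoBERTa test earlier in this diff.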
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Model class. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
from .tokenization_bert import BertTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_transfo_xl import TransfoXLTokenizer
from .tokenization_xlnet import XLNetTokenizer
from .tokenization_xlm import XLMTokenizer
from .tokenization_roberta import RobertaTokenizer
logger = logging.getLogger(__name__)
class AutoTokenizer(object):
    r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class
        that will be instantiated as one of the tokenizer classes of the library
        when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct tokenizer class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `roberta`: RobertaTokenizer (RoBERTa model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
            - contains `xlnet`: XLNetTokenizer (XLNet model)
            - contains `xlm`: XLMTokenizer (XLM model)

        This class cannot be instantiated using `__init__()` (it throws an error).
    """
    def __init__(self):
        raise EnvironmentError("AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r""" Instantiate one of the tokenizer classes of the library
        from a pre-trained model vocabulary.

        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `roberta`: RobertaTokenizer (RoBERTa model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
            - contains `xlnet`: XLNetTokenizer (XLNet model)
            - contains `xlm`: XLMTokenizer (XLM model)

        Params:
            **pretrained_model_name_or_path**: either:
                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
                - a path to a `directory` containing a configuration file saved
                    using the `save_pretrained(save_directory)` method.
                - a path or url to a saved configuration `file`.
            **cache_dir**: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

        Examples::

            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')        # Download vocabulary from S3 and cache.
            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
        """
        # 'roberta' is tested first because the string 'roberta' itself contains
        # the substring 'bert', so the BERT branch would otherwise shadow it.
        if 'roberta' in pretrained_model_name_or_path:
            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
@@ -22,7 +22,7 @@ import os

import unicodedata
from io import open

from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

@@ -67,10 +67,10 @@ def load_vocab(vocab_file):

    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        # readlines() splits only on newlines, unlike str.splitlines(), which
        # also splits on other Unicode line boundaries that can legitimately
        # occur inside vocabulary tokens.
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip('\n')
        vocab[token] = index
    return vocab

@@ -86,7 +86,7 @@ def whitespace_tokenize(text):

class BertTokenizer(PreTrainedTokenizer):
    r"""
    Constructs a BertTokenizer.
    :class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file

@@ -119,12 +119,15 @@ class BertTokenizer(PreTrainedTokenizer):

            Only has an effect when do_basic_tokenize=True
        **tokenize_chinese_chars**: (`optional`) boolean (default True)
            Whether to tokenize Chinese characters.
            This should likely be deactivated for Japanese:
            see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
        """
        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                            pad_token=pad_token, cls_token=cls_token,
                                            mask_token=mask_token, **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "

@@ -166,11 +169,29 @@ class BertTokenizer(PreTrainedTokenizer):

        out_string = ' '.join(tokens).replace(' ##', '').strip()
        return out_string

    def add_special_tokens_single_sentence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]
        """
        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file."""
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:

@@ -214,7 +235,7 @@ class BasicTokenizer(object):

            List of tokens not to split.
        **tokenize_chinese_chars**: (`optional`) boolean (default True)
            Whether to tokenize Chinese characters.
            This should likely be deactivated for Japanese:
            see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
        """
        if never_split is None:
...
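To make the two new BERT builders concrete, a sketch of the id sequences they produce (using the pretrained bert-base-uncased vocabulary; the exact cls/sep ids come from whatever vocabulary is loaded):

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

ids_a = tokenizer.encode("sequence builders")
ids_b = tokenizer.encode("multi-sequence build")

single = tokenizer.add_special_tokens_single_sentence(ids_a)
# [CLS] A [SEP]        ->  [cls_id] + ids_a + [sep_id]

pair = tokenizer.add_special_tokens_sentences_pair(ids_a, ids_b)
# [CLS] A [SEP] B [SEP]  ->  [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]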
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
from io import open
from .tokenization_bert import BertTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'distilbert-base-uncased': 512,
    'distilbert-base-uncased-distilled-squad': 512,
}

class DistilBertTokenizer(BertTokenizer):
    r"""
    Constructs a DistilBertTokenizer.
    :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
            do_wordpiece_only=False
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
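Since DistilBertTokenizer only overrides the three class-level maps, usage is identical to BertTokenizer; a brief sketch:

from pytorch_transformers.tokenization_distilbert import DistilBertTokenizer

# Note from PRETRAINED_VOCAB_FILES_MAP above: the 'distilbert-base-uncased'
# shortcut points at the bert-base-uncased vocab file, which the two models share.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokens = tokenizer.tokenize(u"unwanted running")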
@@ -31,7 +31,7 @@ except ImportError:

    def lru_cache():
        return lambda func: func

from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

@@ -45,17 +45,20 @@ PRETRAINED_VOCAB_FILES_MAP = {

    {
        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
    },
    'merges_file':
    {
        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'gpt2': 1024,
    'gpt2-medium': 1024,
    'gpt2-large': 1024,
}

@lru_cache()

@@ -102,9 +105,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):

    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}

@@ -177,9 +182,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) to an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (string/unicode) using the vocab."""
...
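What the new unk_token default changes in practice, as a sketch (byte-level BPE rarely produces unknown tokens from ordinary text, but convert_tokens_to_ids can still be called with arbitrary strings):

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Previously no unk_token was configured, so an out-of-vocabulary string
# resolved to None; it now falls back to the id of "<|endoftext|>".
ids = tokenizer.convert_tokens_to_ids([u'hello', u'string-that-is-not-in-the-vocab'])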
@@ -87,10 +87,14 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

        try:
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.")
...
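The spacy change looks motivated by packaging (my reading, stated as an assumption): spacy.load('en') requires the separately downloaded 'en' model package, while the blank English() pipeline ships with spacy itself and is all this tokenizer needs. The equivalent standalone snippet:

import ftfy
from spacy.lang.en import English

_nlp = English()                            # no model download required
nlp = _nlp.Defaults.create_tokenizer(_nlp)  # rule-based tokenizer only
print([t.text for t in nlp(ftfy.fix_text(u"Munich and Berlin are nice cities"))])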
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoBERTa."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import sys
import json
import logging
import os
import regex as re
from io import open

from .tokenization_gpt2 import bytes_to_unicode, get_pairs
from .tokenization_utils import PreTrainedTokenizer

try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
    'vocab_file': 'vocab.json',
    'merges_file': 'merges.txt',
}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
    },
    'merges_file':
    {
        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'roberta-base': 512,
    'roberta-large': 512,
    'roberta-large-mnli': 512,
}
class RobertaTokenizer(PreTrainedTokenizer):
    """
    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
        super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
                                               sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                               mask_token=mask_token, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    @property
    def vocab_size(self):
        return len(self.encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Merge the lowest-ranked pair in the merges file first.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:  # `first` no longer occurs in the remainder
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2:
                token = ''.join(self.byte_encoder[ord(b)] for b in token)
            else:
                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) to an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (string/unicode) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) into a single string. """
        text = ''.join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text

    def add_special_tokens_single_sentence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        A RoBERTa sequence has the following format: <s> X </s>
        """
        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def save_vocabulary(self, save_directory):
        """Save the tokenizer vocabulary and merge files to a directory."""
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])

        with open(vocab_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write(u'#version: 0.2\n')
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
                    index = token_index
                writer.write(' '.join(bpe_tokens) + u'\n')
                index += 1

        return vocab_file, merge_file
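A sketch of the RoBERTa builders' output, tying the <s> A </s></s> B </s> format to concrete ids (in the roberta-base vocabulary, <s> is 0 and </s> is 2, consistent with the [0, ..., 2] sequences in the integration test earlier in this diff):

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

ids_a = tokenizer.encode("sequence builders")
ids_b = tokenizer.encode("multi-sequence build")

single = tokenizer.add_special_tokens_single_sentence(ids_a)
# <s> A </s>            ->  [0] + ids_a + [2]

pair = tokenizer.add_special_tokens_sentences_pair(ids_a, ids_b)
# <s> A </s></s> B </s>  ->  [0] + ids_a + [2, 2] + ids_b + [2]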
@@ -30,7 +30,7 @@ import torch

import numpy as np

from .file_utils import cached_path
from .tokenization_utils import PreTrainedTokenizer

if sys.version_info[0] == 2:
    import cPickle as pickle

@@ -73,6 +73,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):

        super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
                                                 additional_special_tokens=additional_special_tokens,
                                                 **kwargs)

        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
...
@@ -30,14 +30,34 @@ SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'

ADDED_TOKENS_FILE = 'added_tokens.json'

class PreTrainedTokenizer(object):
    """ Base class for all tokenizers.
    Handles all the shared methods for tokenization and special tokens, as well as methods for downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers, so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).

    Class attributes (overridden by derived classes):

        - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
        - ``pretrained_vocab_files_map``: a python ``dict of dict``, the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
        - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.

    Parameters:

        - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token``
        - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token``
        - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token``
        - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token``
        - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token``
        - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token``
        - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token``
        - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensures they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens``
    """

    vocab_files_names = {}
    pretrained_vocab_files_map = {}
@@ -49,48 +69,56 @@ class PreTrainedTokenizer(object):

    @property
    def bos_token(self):
        """ Beginning of sentence token (string). Log an error if used while not having been set. """
        if self._bos_token is None:
            logger.error("Using bos_token, but it is not set yet.")
        return self._bos_token

    @property
    def eos_token(self):
        """ End of sentence token (string). Log an error if used while not having been set. """
        if self._eos_token is None:
            logger.error("Using eos_token, but it is not set yet.")
        return self._eos_token

    @property
    def unk_token(self):
        """ Unknown token (string). Log an error if used while not having been set. """
        if self._unk_token is None:
            logger.error("Using unk_token, but it is not set yet.")
        return self._unk_token

    @property
    def sep_token(self):
        """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
        if self._sep_token is None:
            logger.error("Using sep_token, but it is not set yet.")
        return self._sep_token

    @property
    def pad_token(self):
        """ Padding token (string). Log an error if used while not having been set. """
        if self._pad_token is None:
            logger.error("Using pad_token, but it is not set yet.")
        return self._pad_token

    @property
    def cls_token(self):
        """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
        if self._cls_token is None:
            logger.error("Using cls_token, but it is not set yet.")
        return self._cls_token

    @property
    def mask_token(self):
        """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
        if self._mask_token is None:
            logger.error("Using mask_token, but it is not set yet.")
        return self._mask_token

    @property
    def additional_special_tokens(self):
        """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
        if self._additional_special_tokens is None:
            logger.error("Using additional_special_tokens, but it is not set yet.")
        return self._additional_special_tokens
@@ -138,48 +166,119 @@ class PreTrainedTokenizer(object):
self._additional_special_tokens = []

self.max_len = max_len if max_len is not None else int(1e12)
self.max_len_single_sentence = self.max_len
self.max_len_sentences_pair = self.max_len

self.added_tokens_encoder = {}
self.added_tokens_decoder = {}

for key, value in kwargs.items():
    if key in self.SPECIAL_TOKENS_ATTRIBUTES:
        if key == 'additional_special_tokens':
            assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
        else:
            assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
        setattr(self, key, value)
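A short illustration of the validation above (token values invented for the example): special-token keyword arguments must be strings, and ``additional_special_tokens`` must be a list or tuple of strings, otherwise ``__init__`` now fails fast::

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              additional_special_tokens=['<A>', '<B>'])  # ok: list of strings
    # BertTokenizer.from_pretrained('bert-base-uncased', cls_token=123)  # would raise AssertionError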
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
    r"""
    Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.

    Args:
        pretrained_model_name_or_path: either:

            - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
            - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
            - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.

        cache_dir: (`optional`) string:
            Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
        force_download: (`optional`) boolean, default False:
            Force a (re-)download of the vocabulary files and override the cached versions if they exist.
        proxies: (`optional`) dict, default None:
            A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
            The proxies are used on each request.
        inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
        kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.

    Examples::

        # We can't directly instantiate the base class `PreTrainedTokenizer`, so let's show our examples on a derived class: BertTokenizer

        # Download vocabulary from S3 and cache.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')

        # If the tokenizer uses a single vocabulary file, you can point directly to this file
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')

        # You can link tokens to special vocabulary when instantiating
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
        # You should make sure '<unk>' is in the vocabulary when doing that.
        # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
        assert tokenizer.unk_token == '<unk>'

    """
    return cls._from_pretrained(*inputs, **kwargs)
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
    cache_dir = kwargs.pop('cache_dir', None)
    force_download = kwargs.pop('force_download', False)
    proxies = kwargs.pop('proxies', None)

    s3_models = list(cls.max_model_input_sizes.keys())
    vocab_files = {}
    if pretrained_model_name_or_path in s3_models:
        # Get the vocabulary from AWS S3 bucket
        for file_id, map_list in cls.pretrained_vocab_files_map.items():
            vocab_files[file_id] = map_list[pretrained_model_name_or_path]
    else:
        # Get the vocabulary from local files
        logger.info(
            "Model name '{}' not found in model shortcut name list ({}). "
            "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
                pretrained_model_name_or_path, ', '.join(s3_models),
                pretrained_model_name_or_path))

        # Look for the tokenizer main vocabulary files
        for file_id, file_name in cls.vocab_files_names.items():
            if os.path.isdir(pretrained_model_name_or_path):
                # If a directory is provided we look for the standard filenames
                full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
            else:
                # If a path to a file is provided we use it (will only work for non-BPE tokenizers using a single vocabulary file)
                full_file_name = pretrained_model_name_or_path
            if not os.path.exists(full_file_name):
                logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                full_file_name = None
            vocab_files[file_id] = full_file_name

        # Look for the additional tokens files
        all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
                                 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}

        # If a path to a file was provided, get the parent directory
        saved_directory = pretrained_model_name_or_path
        if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
            saved_directory = os.path.dirname(saved_directory)

        for file_id, file_name in all_vocab_files_names.items():
            full_file_name = os.path.join(saved_directory, file_name)
            if not os.path.exists(full_file_name):
                logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                full_file_name = None
            vocab_files[file_id] = full_file_name

        if all(full_file_name is None for full_file_name in vocab_files.values()):
            logger.error(
                "Model name '{}' was not found in model name list ({}). "

@@ -196,8 +295,8 @@ class PreTrainedTokenizer(object):
            if file_path is None:
                resolved_vocab_files[file_id] = None
            else:
                resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
    except EnvironmentError as e:
        if pretrained_model_name_or_path in s3_models:
            logger.error("Couldn't reach server to download vocabulary.")
        else:

@@ -207,7 +306,7 @@ class PreTrainedTokenizer(object):
                "at this path or url.".format(
                    pretrained_model_name_or_path, ', '.join(s3_models),
                    pretrained_model_name_or_path, str(vocab_files.keys())))
        raise e

    for file_id, file_path in vocab_files.items():
        if file_path == resolved_vocab_files[file_id]:

@@ -251,8 +350,9 @@ class PreTrainedTokenizer(object):
def save_pretrained(self, save_directory):
    """ Save the tokenizer vocabulary files (with added tokens) and the
        special-tokens-to-class-attributes-mapping to a directory.

        This method makes sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
    """
    if not os.path.isdir(save_directory):
        logger.error("Saving directory ({}) should be a directory".format(save_directory))

@@ -266,7 +366,7 @@ class PreTrainedTokenizer(object):
        with open(added_tokens_file, 'w', encoding='utf-8') as f:
            if self.added_tokens_encoder:
                out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False)
            else:
                out_str = u"{}"
            f.write(out_str)

@@ -277,38 +377,53 @@ class PreTrainedTokenizer(object):
def save_vocabulary(self, save_directory):
    """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
        and special token mappings.

        Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
    """
    raise NotImplementedError

def vocab_size(self):
    """ Size of the base vocabulary (without the added tokens) """
    raise NotImplementedError

def __len__(self):
    """ Size of the full vocabulary with the added tokens """
    return self.vocab_size + len(self.added_tokens_encoder)
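Putting ``save_pretrained`` and ``from_pretrained`` together, a usage sketch (directory path hypothetical)::

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.add_tokens(['new_tok1'])

    # writes the vocabulary files plus added_tokens.json and special_tokens_map.json
    tokenizer.save_pretrained('./my_tokenizer/')

    # the full state (vocabulary, added tokens, special tokens) is restored
    reloaded = BertTokenizer.from_pretrained('./my_tokenizer/')
    assert len(reloaded) == len(tokenizer)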
def add_tokens(self, new_tokens):
    """
    Add a list of new tokens to the tokenizer class. If the new tokens are not in the
    vocabulary, they are added to it with indices starting from the length of the current vocabulary.

    Args:
        new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).

    Returns:
        Number of tokens added to the vocabulary.

    Examples::

        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    """
    if not new_tokens:
        return 0

    to_add_tokens = []
    for token in new_tokens:
        assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
        if token != self.unk_token and \
                self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
            to_add_tokens.append(token)
            logger.info("Adding %s to the vocabulary", token)

@@ -321,24 +436,51 @@ class PreTrainedTokenizer(object):
def add_special_tokens(self, special_tokens_dict):
    """
    Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
    to class attributes. If special tokens are NOT in the vocabulary, they are added
    to it (indexed starting from the last index of the current vocabulary).

    Args:
        special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
            [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
            ``additional_special_tokens``].

            Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).

    Returns:
        Number of tokens added to the vocabulary.

    Examples::

        # Let's see how to add a new classification token to GPT-2
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')

        special_tokens_dict = {'cls_token': '<CLS>'}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.

        assert tokenizer.cls_token == '<CLS>'
    """
    if not special_tokens_dict:
        return 0

    added_tokens = 0
    for key, value in special_tokens_dict.items():
        assert key in self.SPECIAL_TOKENS_ATTRIBUTES
        if key == 'additional_special_tokens':
            assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
            added_tokens += self.add_tokens(value)
        else:
            assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
            added_tokens += self.add_tokens([value])
        logger.info("Assigning %s to the %s key of the tokenizer", value, key)
        setattr(self, key, value)

    return added_tokens
def tokenize(self, text, **kwargs):
    """ Converts a string into a sequence of tokens (string), using the tokenizer.

@@ -347,15 +489,45 @@ class PreTrainedTokenizer(object):
        Take care of added tokens.
    """
    def split_on_token(tok, text):
        result = []
        split_text = text.split(tok)
        for i, sub_text in enumerate(split_text):
            sub_text = sub_text.strip()
            if i == 0 and not sub_text:
                result += [tok]
            elif i == len(split_text) - 1:
                if sub_text:
                    result += [sub_text]
                else:
                    pass
            else:
                if sub_text:
                    result += [sub_text]
                result += [tok]
        return result

    def split_on_tokens(tok_list, text):
        if not text:
            return []
        if not tok_list:
            return self._tokenize(text, **kwargs)

        tokenized_text = []
        text_list = [text]
        for tok in tok_list:
            tokenized_text = []
            for sub_text in text_list:
                if sub_text not in self.added_tokens_encoder \
                        and sub_text not in self.all_special_tokens:
                    tokenized_text += split_on_token(tok, sub_text)
                else:
                    tokenized_text += [sub_text]
            text_list = tokenized_text

        return sum((self._tokenize(token, **kwargs) if token not \
            in self.added_tokens_encoder and token not in self.all_special_tokens \
            else [token] for token in tokenized_text), [])

    added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
    tokenized_text = split_on_tokens(added_tokens, text)
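To make the control flow above concrete, a hedged walk-through (token and text invented): once '<new_tok>' has been registered through ``add_tokens``, ``tokenize`` first splits the input on it and only runs the model-specific ``_tokenize`` on the remaining sub-strings::

    tokenizer.add_tokens(['<new_tok>'])
    tokenizer.tokenize("hello <new_tok> world")
    # equivalent to: self._tokenize("hello") + ['<new_tok>'] + self._tokenize("world")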
@@ -366,13 +538,13 @@ class PreTrainedTokenizer(object):
        Split in words for word-based vocabulary or sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece).

        Do NOT take care of added tokens.
    """
    raise NotImplementedError

def convert_tokens_to_ids(self, tokens):
    """ Converts a single token, or a sequence of tokens (str/unicode), into a single integer id
        (resp. a sequence of ids), using the vocabulary.
    """
    if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
        return self._convert_token_to_id_with_added_voc(tokens)

@@ -394,13 +566,39 @@ class PreTrainedTokenizer(object):
def _convert_token_to_id(self, token):
    raise NotImplementedError
def encode(self, text, text_pair=None, add_special_tokens=False):
    """
    Converts a string into a sequence of ids (integer), using the tokenizer and vocabulary.

    Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

    Args:
        text: The first sequence to be encoded.
        text_pair: Optional second sequence to be encoded.
        add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
            to their model.
    """
    if text_pair is None:
        if add_special_tokens:
            return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
        else:
            return self.convert_tokens_to_ids(self.tokenize(text))

    first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
    second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)]

    if add_special_tokens:
        return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
    else:
        return first_sentence_tokens, second_sentence_tokens
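A usage sketch (shortcut name illustrative; on the base class the two hooks below are no-ops that only log a warning, while derived classes like BertTokenizer override them)::

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    ids = tokenizer.encode("Hello world")  # same as convert_tokens_to_ids(tokenize(...))
    pair_ids = tokenizer.encode("Hello", "world", add_special_tokens=True)
    # -> [CLS] ids("Hello") [SEP] ids("world") [SEP] for Bert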
def add_special_tokens_single_sentence(self, token_ids):
    logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
    return token_ids

def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
    return token_ids_0 + token_ids_1
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
    """ Converts a single index or a sequence of indices (integers) into a token

@@ -435,14 +633,28 @@ class PreTrainedTokenizer(object):
    return ' '.join(self.convert_ids_to_tokens(tokens))
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
    """
    Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary
    with options to remove special tokens and clean up tokenization spaces.
    Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
    """
    filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
    text = self.convert_tokens_to_string(filtered_tokens)

    if self._sep_token is not None and self._sep_token in text:
        text = text.replace(self._cls_token, self._sep_token)
        split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
        if clean_up_tokenization_spaces:
            clean_text = [self.clean_up_tokenization(text) for text in split_text]
            return clean_text
        else:
            return split_text
    else:
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
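A hedged sketch of the resulting behaviour (strings invented): when the decoded text contains the separator token, ``decode`` now returns a list of cleaned sentence strings instead of one string::

    ids = tokenizer.encode("Hello", "world", add_special_tokens=True)
    tokenizer.decode(ids)
    # -> e.g. ["Hello", "world"]: the text was split on sep_token and cleaned up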
@property
def special_tokens_map(self):

@@ -474,13 +686,14 @@ class PreTrainedTokenizer(object):
        class attributes (cls_token, unk_token...).
    """
    all_toks = self.all_special_tokens
    all_ids = list(self._convert_token_to_id(t) for t in all_toks)
    return all_ids
@staticmethod
def clean_up_tokenization(out_string):
    """ Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.
    """
    out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
    return out_string
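A quick worked example of the replace chain above (input invented)::

    PreTrainedTokenizer.clean_up_tokenization("do n't stop , please !")
    # -> "don't stop, please!"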
@@ -122,10 +122,15 @@ class XLMTokenizer(PreTrainedTokenizer):
                         cls_token=cls_token, mask_token=mask_token,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
        try:
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.")
@@ -214,6 +219,22 @@ class XLMTokenizer(PreTrainedTokenizer):
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        return out_string
    def add_special_tokens_single_sentence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        An XLM sequence has the following format: [CLS] X [SEP]
        """
        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return cls + token_ids_0 + sep + token_ids_1 + sep
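A usage sketch of the XLM format above (shortcut name illustrative)::

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
    ids = tokenizer.encode("Hello", "world", add_special_tokens=True)
    # -> [CLS] ids("Hello") [SEP] ids("world") [SEP]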
    def save_vocabulary(self, save_directory):
        """Save the tokenizer vocabulary and merge files to a directory."""
        if not os.path.isdir(save_directory):
......
@@ -23,7 +23,7 @@ from shutil import copyfile
import unicodedata

import six

from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
                         pad_token=pad_token, cls_token=cls_token,
                         mask_token=mask_token, additional_special_tokens=
                         additional_special_tokens, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
        try:
            import sentencepiece as spm
        except ImportError:

@@ -177,6 +181,24 @@ class XLNetTokenizer(PreTrainedTokenizer):
        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
        return out_string
    def add_special_tokens_single_sentence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        An XLNet sequence has the following format: X [SEP][CLS]
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return token_ids + sep + cls

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return token_ids_0 + sep + token_ids_1 + sep + cls
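A matching usage sketch (shortcut name illustrative; note that for XLNet the CLS token comes last)::

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    ids = tokenizer.encode("Hello", "world", add_special_tokens=True)
    # -> ids("Hello") [SEP] ids("world") [SEP][CLS]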
    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
......
# PyTorch
torch>=1.0.0
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
......
@@ -38,10 +38,10 @@ from setuptools import find_packages, setup
setup(
    name="pytorch_transformers",
    version="1.1.0",
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
    author_email="thomas@huggingface.co",
    description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
    long_description=open("README.md", "r", encoding='utf-8').read(),
    long_description_content_type="text/markdown",
    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',

@@ -49,7 +49,7 @@ setup(
    url="https://github.com/huggingface/pytorch-transformers",
    packages=find_packages(exclude=["*.tests", "*.tests.*",
                                    "tests.*", "tests"]),
    install_requires=['torch>=1.0.0',
                      'numpy',
                      'boto3',
                      'requests',
......