"examples/vscode:/vscode.git/clone" did not exist on "e67fea2c7e1f710d77a1a1b36da01b541c312c6c"
Unverified commit 0a2fecdf, authored by Thomas Wolf, committed by GitHub

Merge branch 'master' into master

Parents: 39eb31e1 e0caab0c
@@ -20,13 +20,17 @@ import json

from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases

class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = OpenAIGPTTokenizer

    def setUp(self):
        super(OpenAIGPTTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",

@@ -34,30 +38,34 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):

        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

if __name__ == '__main__':
...
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import json
import unittest
from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases
class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = RobertaTokenizer

    def setUp(self):
        super(RobertaTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "lo", "low", "er",
                 "low", "lowest", "newer", "wider", "<unk>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return RobertaTokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower<unk>newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower"
        bpe_tokens = ["low", "er"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [13, 12, 17]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    def roberta_dict_integration_testing(self):
        tokenizer = self.get_tokenizer()
        self.assertListEqual(
            tokenizer.encode('Hello world!'),
            [0, 31414, 232, 328, 2]
        )
        self.assertListEqual(
            tokenizer.encode('Hello world! cécé herlolip 418'),
            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
        )

    def test_sequence_builders(self):
        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
        encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == encoded_text_from_decode
        assert encoded_pair == encoded_pair_from_decode

if __name__ == '__main__':
    unittest.main()
@@ -19,6 +19,7 @@ import sys

from io import open
import tempfile
import shutil
import unittest

if sys.version_info[0] == 2:
    import cPickle as pickle

@@ -36,113 +37,124 @@ else:

    unicode = str

class CommonTestCases:

    class CommonTokenizerTester(unittest.TestCase):

        tokenizer_class = None

        def setUp(self):
            self.tmpdirname = tempfile.mkdtemp()

        def tearDown(self):
            shutil.rmtree(self.tmpdirname)

        def get_tokenizer(self):
            raise NotImplementedError

        def get_input_output_texts(self):
            raise NotImplementedError

        def test_save_and_load_tokenizer(self):
            tokenizer = self.get_tokenizer()
            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")

            with TemporaryDirectory() as tmpdirname:
                tokenizer.save_pretrained(tmpdirname)
                tokenizer = tokenizer.from_pretrained(tmpdirname)

            after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
            self.assertListEqual(before_tokens, after_tokens)

        def test_pickle_tokenizer(self):
            tokenizer = self.get_tokenizer()
            self.assertIsNotNone(tokenizer)

            text = u"Munich and Berlin are nice cities"
            subwords = tokenizer.tokenize(text)

            with TemporaryDirectory() as tmpdirname:
                filename = os.path.join(tmpdirname, u"tokenizer.bin")
                pickle.dump(tokenizer, open(filename, "wb"))
                tokenizer_new = pickle.load(open(filename, "rb"))

            subwords_loaded = tokenizer_new.tokenize(text)

            self.assertListEqual(subwords, subwords_loaded)

        def test_add_tokens_tokenizer(self):
            tokenizer = self.get_tokenizer()

            vocab_size = tokenizer.vocab_size
            all_size = len(tokenizer)

            self.assertNotEqual(vocab_size, 0)
            self.assertEqual(vocab_size, all_size)

            new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
            added_toks = tokenizer.add_tokens(new_toks)
            vocab_size_2 = tokenizer.vocab_size
            all_size_2 = len(tokenizer)

            self.assertNotEqual(vocab_size_2, 0)
            self.assertEqual(vocab_size, vocab_size_2)
            self.assertEqual(added_toks, len(new_toks))
            self.assertEqual(all_size_2, all_size + len(new_toks))

            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
            self.assertGreaterEqual(len(tokens), 4)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

            new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
                          'pad_token': "<<<<<|||>|>>>>|>"}
            added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
            vocab_size_3 = tokenizer.vocab_size
            all_size_3 = len(tokenizer)

            self.assertNotEqual(vocab_size_3, 0)
            self.assertEqual(vocab_size, vocab_size_3)
            self.assertEqual(added_toks_2, len(new_toks_2))
            self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")

            self.assertGreaterEqual(len(tokens), 6)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[0], tokens[1])
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokens[-3])
            self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
            self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))

        def test_required_methods_tokenizer(self):
            tokenizer = self.get_tokenizer()
            input_text, output_text = self.get_input_output_texts()

            tokens = tokenizer.tokenize(input_text)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            ids_2 = tokenizer.encode(input_text)
            self.assertListEqual(ids, ids_2)

            tokens_2 = tokenizer.convert_ids_to_tokens(ids)
            text_2 = tokenizer.decode(ids)

            self.assertEqual(text_2, output_text)

            self.assertNotEqual(len(tokens_2), 0)
            self.assertIsInstance(text_2, (str, unicode))

        def test_pretrained_model_lists(self):
            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
            weights_lists_2 = []
            for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
                weights_lists_2.append(list(map_list.keys()))

            for weights_list_2 in weights_lists_2:
                self.assertListEqual(weights_list, weights_list_2)
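The pattern the refactor establishes: each concrete test case subclasses CommonTestCases.CommonTokenizerTester, writes its fixture files into self.tmpdirname in setUp, and implements the two factory methods; the five test_* methods above then run against it automatically. A minimal sketch (the class name and fixture vocabulary here are illustrative, mirroring the real test files in this diff, and are not part of the change itself):

import os

from pytorch_transformers.tokenization_bert import BertTokenizer
from .tokenization_tests_commons import CommonTestCases

class BertTokenizationSketch(CommonTestCases.CommonTokenizerTester):
    # Hypothetical subclass for illustration only.
    tokenizer_class = BertTokenizer

    def setUp(self):
        super(BertTokenizationSketch, self).setUp()
        # One wordpiece per line, as load_vocab() expects.
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "runn", "##ing", ","]
        self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
        with open(self.vocab_file, "w", encoding="utf-8") as f:
            f.write("".join(t + "\n" for t in vocab_tokens))

    def get_tokenizer(self):
        return BertTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        return u"UNwant\u00E9d,running", u"unwanted, running"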
@@ -20,32 +20,39 @@ from io import open

from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases

class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = TransfoXLTokenizer

    def setUp(self):
        super(TransfoXLTokenizationTest, self).setUp()

        vocab_tokens = [
            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
            "running", ",", "low", "l",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self):
        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)

    def get_input_output_texts(self):
        input_text = u"<unk> UNwanted , running"
        output_text = u"<unk> unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)

        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])

    def test_full_tokenizer_lower(self):
        tokenizer = TransfoXLTokenizer(lower_case=True)
...
@@ -20,12 +20,16 @@ import json

from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
from .tokenization_tests_commons import CommonTestCases

class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = XLMTokenizer

    def setUp(self):
        super(XLMTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",

@@ -33,31 +37,46 @@ class XLMTokenizationTest(unittest.TestCase):

        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return XLMTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
        tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

    def test_sequence_builders(self):
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == [1] + text + [1]
        assert encoded_pair == [1] + text + [1] + text_2 + [1]

if __name__ == '__main__':
    unittest.main()
@@ -19,48 +19,58 @@ import unittest

from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
from .tokenization_tests_commons import CommonTestCases

SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'fixtures/test_sentencepiece.model')

class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = XLNetTokenizer

    def setUp(self):
        super(XLNetTokenizationTest, self).setUp()

        # We have a SentencePiece fixture for testing
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)

    def get_tokenizer(self):
        return XLNetTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"This is a test"
        output_text = u"This is a test"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize(u'This is a test')
        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])

        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
                                      u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
                                      u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                      SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids, [8, 21, 84, 55, 24, 19, 7, 0,
                  602, 347, 347, 347, 3, 12, 66,
                  46, 72, 80, 6, 0, 4])

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
                                           u'or', u'n', SPIECE_UNDERLINE + u'in',
                                           SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
                                           SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                           SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
                                           u'<unk>', u'.'])

    def test_tokenizer_lower(self):
        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)

@@ -79,6 +89,18 @@ class XLNetTokenizationTest(unittest.TestCase):

                              u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                              SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])

    def test_sequence_builders(self):
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)

        assert encoded_sentence == text + [4, 3]
        assert encoded_pair == text + [4] + text_2 + [4, 3]

if __name__ == '__main__':
    unittest.main()
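Since the new test_sequence_builders methods assert the layouts only as raw ids, it may help to spell out what each model family's builder produces. A summary of the assertions above, with the special-token ids they rely on:

# XLM   (xlm-mlm-en-2048):  [1] + A + [1]   and   [1] + A + [1] + B + [1]
#                           where 1 is the boundary token id asserted in the XLM test.
# XLNet (xlnet-base-cased): A + [4, 3]      and   A + [4] + B + [4, 3]
#                           where 4 = <sep> and 3 = <cls>; XLNet appends its
#                           special tokens at the end instead of prepending them.
# RoBERTa (roberta-base):   checked by round-tripping through
#                           encode(..., add_special_tokens=True), as asserted
#                           in the RoBERTa test earlier in this diff.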
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Model class. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
from .tokenization_bert import BertTokenizer
from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_transfo_xl import TransfoXLTokenizer
from .tokenization_xlnet import XLNetTokenizer
from .tokenization_xlm import XLMTokenizer
from .tokenization_roberta import RobertaTokenizer
logger = logging.getLogger(__name__)
class AutoTokenizer(object):
    r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class
        that will be instantiated as one of the tokenizer classes of the library
        when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
        class method.

        The `from_pretrained()` method takes care of returning the correct tokenizer class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `roberta`: RobertaTokenizer (RoBERTa model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
            - contains `xlnet`: XLNetTokenizer (XLNet model)
            - contains `xlm`: XLMTokenizer (XLM model)

        This class cannot be instantiated using `__init__()` (it throws an error).
    """
    def __init__(self):
        raise EnvironmentError("AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.")

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r""" Instantiate one of the tokenizer classes of the library
        from a pre-trained model vocabulary.

        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
            - contains `roberta`: RobertaTokenizer (RoBERTa model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
            - contains `xlnet`: XLNetTokenizer (XLNet model)
            - contains `xlm`: XLMTokenizer (XLM model)

        Params:
            **pretrained_model_name_or_path**: either:
                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
                - a path to a `directory` containing a configuration file saved
                    using the `save_pretrained(save_directory)` method.
                - a path or url to a saved configuration `file`.
            **cache_dir**: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

        Examples::

            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')        # Download vocabulary from S3 and cache.
            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
        """
        # 'roberta' is tested first because the string 'roberta' itself contains
        # the substring 'bert', so the BERT branch would otherwise shadow it.
        if 'roberta' in pretrained_model_name_or_path:
            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:
            return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'gpt2' in pretrained_model_name_or_path:
            return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'transfo-xl' in pretrained_model_name_or_path:
            return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'xlnet' in pretrained_model_name_or_path:
            return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'xlm' in pretrained_model_name_or_path:
            return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
@@ -22,7 +22,7 @@ import os

import unicodedata
from io import open

from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

@@ -67,10 +67,10 @@ def load_vocab(vocab_file):

    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        # readlines() splits only on newlines, unlike str.splitlines(), which
        # also splits on other Unicode line boundaries that can legitimately
        # occur inside vocabulary tokens.
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip('\n')
        vocab[token] = index
    return vocab

@@ -86,7 +86,7 @@ def whitespace_tokenize(text):

class BertTokenizer(PreTrainedTokenizer):
    r"""
    Constructs a BertTokenizer.
    :class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file

@@ -119,12 +119,15 @@ class BertTokenizer(PreTrainedTokenizer):

            Only has an effect when do_basic_tokenize=True
        **tokenize_chinese_chars**: (`optional`) boolean (default True)
            Whether to tokenize Chinese characters.
            This should likely be deactivated for Japanese:
            see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
        """
        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
                                            pad_token=pad_token, cls_token=cls_token,
                                            mask_token=mask_token, **kwargs)
        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "

@@ -166,11 +169,29 @@ class BertTokenizer(PreTrainedTokenizer):

        out_string = ' '.join(tokens).replace(' ##', '').strip()
        return out_string

    def add_special_tokens_single_sentence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]
        """
        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file."""
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:

@@ -214,7 +235,7 @@ class BasicTokenizer(object):

            List of tokens not to split.
        **tokenize_chinese_chars**: (`optional`) boolean (default True)
            Whether to tokenize Chinese characters.
            This should likely be deactivated for Japanese:
            see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
        """
        if never_split is None:
...
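To make the two new BERT builders concrete, a sketch of the id sequences they produce (using the pretrained bert-base-uncased vocabulary; the exact cls/sep ids come from whatever vocabulary is loaded):

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

ids_a = tokenizer.encode("sequence builders")
ids_b = tokenizer.encode("multi-sequence build")

single = tokenizer.add_special_tokens_single_sentence(ids_a)
# [CLS] A [SEP]        ->  [cls_id] + ids_a + [sep_id]

pair = tokenizer.add_special_tokens_sentences_pair(ids_a, ids_b)
# [CLS] A [SEP] B [SEP]  ->  [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]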
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for DistilBERT."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import logging
import os
import unicodedata
from io import open
from .tokenization_bert import BertTokenizer
logger = logging.getLogger(__name__)
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
        'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'distilbert-base-uncased': 512,
    'distilbert-base-uncased-distilled-squad': 512,
}

class DistilBertTokenizer(BertTokenizer):
    r"""
    Constructs a DistilBertTokenizer.
    :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
            do_wordpiece_only=False
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
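Since DistilBertTokenizer only overrides the three class-level maps, usage is identical to BertTokenizer; a brief sketch:

from pytorch_transformers.tokenization_distilbert import DistilBertTokenizer

# Note from PRETRAINED_VOCAB_FILES_MAP above: the 'distilbert-base-uncased'
# shortcut points at the bert-base-uncased vocab file, which the two models share.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokens = tokenizer.tokenize(u"unwanted running")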
@@ -31,7 +31,7 @@ except ImportError:

    def lru_cache():
        return lambda func: func

from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)

@@ -45,17 +45,20 @@ PRETRAINED_VOCAB_FILES_MAP = {

    {
        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
    },
    'merges_file':
    {
        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'gpt2': 1024,
    'gpt2-medium': 1024,
    'gpt2-large': 1024,
}

@lru_cache()

@@ -102,9 +105,11 @@ class GPT2Tokenizer(PreTrainedTokenizer):

    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}

@@ -177,9 +182,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) to an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (string/unicode) using the vocab."""
...
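What the new unk_token default changes in practice, as a sketch (byte-level BPE rarely produces unknown tokens from ordinary text, but convert_tokens_to_ids can still be called with arbitrary strings):

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Previously no unk_token was configured, so an out-of-vocabulary string
# resolved to None; it now falls back to the id of "<|endoftext|>".
ids = tokenizer.convert_tokens_to_ids([u'hello', u'string-that-is-not-in-the-vocab'])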
@@ -87,10 +87,14 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)

        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

        try:
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.")
...
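The spacy change looks motivated by packaging (my reading, stated as an assumption): spacy.load('en') requires the separately downloaded 'en' model package, while the blank English() pipeline ships with spacy itself and is all this tokenizer needs. The equivalent standalone snippet:

import ftfy
from spacy.lang.en import English

_nlp = English()                            # no model download required
nlp = _nlp.Defaults.create_tokenizer(_nlp)  # rule-based tokenizer only
print([t.text for t in nlp(ftfy.fix_text(u"Munich and Berlin are nice cities"))])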
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RoBERTa."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import sys
import json
import logging
import os
import regex as re
from io import open

from .tokenization_gpt2 import bytes_to_unicode, get_pairs
from .tokenization_utils import PreTrainedTokenizer

try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func

logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {
    'vocab_file': 'vocab.json',
    'merges_file': 'merges.txt',
}

PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file':
    {
        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
    },
    'merges_file':
    {
        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'roberta-base': 512,
    'roberta-large': 512,
    'roberta-large-mnli': 512,
}
class RobertaTokenizer(PreTrainedTokenizer):
    """
    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
        super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
                                               sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
                                               mask_token=mask_token, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens

        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    @property
    def vocab_size(self):
        return len(self.encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Merge the lowest-ranked pair in the merges file first.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:  # `first` no longer occurs in the remainder
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2:
                token = ''.join(self.byte_encoder[ord(b)] for b in token)
            else:
                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) to an id using the vocab. """
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (string/unicode) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """ Converts a sequence of tokens (string) into a single string. """
        text = ''.join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text

    def add_special_tokens_single_sentence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        A RoBERTa sequence has the following format: <s> X </s>
        """
        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def save_vocabulary(self, save_directory):
        """Save the tokenizer vocabulary and merge files to a directory."""
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])

        with open(vocab_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write(u'#version: 0.2\n')
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
                    index = token_index
                writer.write(' '.join(bpe_tokens) + u'\n')
                index += 1

        return vocab_file, merge_file
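A sketch of the RoBERTa builders' output, tying the <s> A </s></s> B </s> format to concrete ids (in the roberta-base vocabulary, <s> is 0 and </s> is 2, consistent with the [0, ..., 2] sequences in the integration test earlier in this diff):

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

ids_a = tokenizer.encode("sequence builders")
ids_b = tokenizer.encode("multi-sequence build")

single = tokenizer.add_special_tokens_single_sentence(ids_a)
# <s> A </s>            ->  [0] + ids_a + [2]

pair = tokenizer.add_special_tokens_sentences_pair(ids_a, ids_b)
# <s> A </s></s> B </s>  ->  [0] + ids_a + [2, 2] + ids_b + [2]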
@@ -30,7 +30,7 @@ import torch

import numpy as np

from .file_utils import cached_path
from .tokenization_utils import PreTrainedTokenizer

if sys.version_info[0] == 2:
    import cPickle as pickle

@@ -73,6 +73,10 @@ class TransfoXLTokenizer(PreTrainedTokenizer):

        super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token,
                                                 additional_special_tokens=additional_special_tokens,
                                                 **kwargs)

        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
...
@@ -30,14 +30,34 @@ SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'

ADDED_TOKENS_FILE = 'added_tokens.json'

class PreTrainedTokenizer(object):
    """ Base class for all tokenizers.
    Handles all the shared methods for tokenization and special tokens, as well as methods for downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers, so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).

    Class attributes (overridden by derived classes):

        - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
        - ``pretrained_vocab_files_map``: a python ``dict of dict``, the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
        - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.

    Parameters:

        - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token``
        - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token``
        - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token``
        - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token``
        - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token``
        - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token``
        - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token``
        - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensures they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens``
    """

    vocab_files_names = {}
    pretrained_vocab_files_map = {}
@@ -49,48 +69,56 @@ class PreTrainedTokenizer(object):

    @property
    def bos_token(self):
        """ Beginning of sentence token (string). Log an error if used while not having been set. """
        if self._bos_token is None:
            logger.error("Using bos_token, but it is not set yet.")
        return self._bos_token

    @property
    def eos_token(self):
        """ End of sentence token (string). Log an error if used while not having been set. """
        if self._eos_token is None:
            logger.error("Using eos_token, but it is not set yet.")
        return self._eos_token

    @property
    def unk_token(self):
        """ Unknown token (string). Log an error if used while not having been set. """
        if self._unk_token is None:
            logger.error("Using unk_token, but it is not set yet.")
        return self._unk_token

    @property
    def sep_token(self):
        """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
        if self._sep_token is None:
            logger.error("Using sep_token, but it is not set yet.")
        return self._sep_token

    @property
    def pad_token(self):
        """ Padding token (string). Log an error if used while not having been set. """
        if self._pad_token is None:
            logger.error("Using pad_token, but it is not set yet.")
        return self._pad_token

    @property
    def cls_token(self):
        """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
        if self._cls_token is None:
            logger.error("Using cls_token, but it is not set yet.")
        return self._cls_token

    @property
    def mask_token(self):
        """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
        if self._mask_token is None:
            logger.error("Using mask_token, but it is not set yet.")
        return self._mask_token

    @property
    def additional_special_tokens(self):
        """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
        if self._additional_special_tokens is None:
            logger.error("Using additional_special_tokens, but it is not set yet.")
        return self._additional_special_tokens
@@ -138,48 +166,119 @@ class PreTrainedTokenizer(object):
self._additional_special_tokens = []

self.max_len = max_len if max_len is not None else int(1e12)
self.max_len_single_sentence = self.max_len
self.max_len_sentences_pair = self.max_len

self.added_tokens_encoder = {}
self.added_tokens_decoder = {}

for key, value in kwargs.items():
    if key in self.SPECIAL_TOKENS_ATTRIBUTES:
        if key == 'additional_special_tokens':
            assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
        else:
            assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
        setattr(self, key, value)
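A short illustration of the validation above (token values invented for the example): special-token keyword arguments must be strings, and ``additional_special_tokens`` must be a list or tuple of strings, otherwise ``__init__`` now fails fast::

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              additional_special_tokens=['<A>', '<B>'])  # ok: list of strings
    # BertTokenizer.from_pretrained('bert-base-uncased', cls_token=123)  # would raise AssertionError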
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
    r"""
    Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.

    Args:
        pretrained_model_name_or_path: either:

            - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
            - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
            - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.

        cache_dir: (`optional`) string:
            Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
        force_download: (`optional`) boolean, default False:
            Force a (re-)download of the vocabulary files and override the cached versions if they exist.
        proxies: (`optional`) dict, default None:
            A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
            The proxies are used on each request.
        inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
        kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.

    Examples::

        # We can't directly instantiate the base class `PreTrainedTokenizer`, so let's show our examples on a derived class: BertTokenizer

        # Download vocabulary from S3 and cache.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')

        # If the tokenizer uses a single vocabulary file, you can point directly to this file
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')

        # You can link tokens to special vocabulary when instantiating
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
        # You should make sure '<unk>' is in the vocabulary when doing that.
        # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
        assert tokenizer.unk_token == '<unk>'

    """
    return cls._from_pretrained(*inputs, **kwargs)
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
    cache_dir = kwargs.pop('cache_dir', None)
    force_download = kwargs.pop('force_download', False)
    proxies = kwargs.pop('proxies', None)

    s3_models = list(cls.max_model_input_sizes.keys())
    vocab_files = {}
    if pretrained_model_name_or_path in s3_models:
        # Get the vocabulary from AWS S3 bucket
        for file_id, map_list in cls.pretrained_vocab_files_map.items():
            vocab_files[file_id] = map_list[pretrained_model_name_or_path]
    else:
        # Get the vocabulary from local files
        logger.info(
            "Model name '{}' not found in model shortcut name list ({}). "
            "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
                pretrained_model_name_or_path, ', '.join(s3_models),
                pretrained_model_name_or_path))

        # Look for the tokenizer main vocabulary files
        for file_id, file_name in cls.vocab_files_names.items():
            if os.path.isdir(pretrained_model_name_or_path):
                # If a directory is provided we look for the standard filenames
                full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
            else:
                # If a path to a file is provided we use it (will only work for non-BPE tokenizers using a single vocabulary file)
                full_file_name = pretrained_model_name_or_path
            if not os.path.exists(full_file_name):
                logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                full_file_name = None
            vocab_files[file_id] = full_file_name

        # Look for the additional tokens files
        all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
                                 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}

        # If a path to a file was provided, get the parent directory
        saved_directory = pretrained_model_name_or_path
        if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
            saved_directory = os.path.dirname(saved_directory)

        for file_id, file_name in all_vocab_files_names.items():
            full_file_name = os.path.join(saved_directory, file_name)
            if not os.path.exists(full_file_name):
                logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                full_file_name = None
            vocab_files[file_id] = full_file_name

        if all(full_file_name is None for full_file_name in vocab_files.values()):
            logger.error(
                "Model name '{}' was not found in model name list ({}). "

@@ -196,8 +295,8 @@ class PreTrainedTokenizer(object):
            if file_path is None:
                resolved_vocab_files[file_id] = None
            else:
                resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
    except EnvironmentError as e:
        if pretrained_model_name_or_path in s3_models:
            logger.error("Couldn't reach server to download vocabulary.")
        else:

@@ -207,7 +306,7 @@ class PreTrainedTokenizer(object):
                "at this path or url.".format(
                    pretrained_model_name_or_path, ', '.join(s3_models),
                    pretrained_model_name_or_path, str(vocab_files.keys())))
        raise e

    for file_id, file_path in vocab_files.items():
        if file_path == resolved_vocab_files[file_id]:

@@ -251,8 +350,9 @@ class PreTrainedTokenizer(object):
def save_pretrained(self, save_directory):
    """ Save the tokenizer vocabulary files (with added tokens) and the
        special-tokens-to-class-attributes-mapping to a directory.

        This method makes sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
    """
    if not os.path.isdir(save_directory):
        logger.error("Saving directory ({}) should be a directory".format(save_directory))

@@ -266,7 +366,7 @@ class PreTrainedTokenizer(object):
        with open(added_tokens_file, 'w', encoding='utf-8') as f:
            if self.added_tokens_encoder:
                out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False)
            else:
                out_str = u"{}"
            f.write(out_str)

@@ -277,38 +377,53 @@ class PreTrainedTokenizer(object):
def save_vocabulary(self, save_directory):
    """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
        and special token mappings.

        Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
    """
    raise NotImplementedError

def vocab_size(self):
    """ Size of the base vocabulary (without the added tokens) """
    raise NotImplementedError

def __len__(self):
    """ Size of the full vocabulary with the added tokens """
    return self.vocab_size + len(self.added_tokens_encoder)
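Putting ``save_pretrained`` and ``from_pretrained`` together, a usage sketch (directory path hypothetical)::

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.add_tokens(['new_tok1'])

    # writes the vocabulary files plus added_tokens.json and special_tokens_map.json
    tokenizer.save_pretrained('./my_tokenizer/')

    # the full state (vocabulary, added tokens, special tokens) is restored
    reloaded = BertTokenizer.from_pretrained('./my_tokenizer/')
    assert len(reloaded) == len(tokenizer)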
def add_tokens(self, new_tokens):
    """
    Add a list of new tokens to the tokenizer class. If the new tokens are not in the
    vocabulary, they are added to it with indices starting from the length of the current vocabulary.

    Args:
        new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).

    Returns:
        Number of tokens added to the vocabulary.

    Examples::

        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')

        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    """
    if not new_tokens:
        return 0

    to_add_tokens = []
    for token in new_tokens:
        assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
        if token != self.unk_token and \
                self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
            to_add_tokens.append(token)
            logger.info("Adding %s to the vocabulary", token)

@@ -321,24 +436,51 @@ class PreTrainedTokenizer(object):
def add_special_tokens(self, special_tokens_dict):
    """
    Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
    to class attributes. If special tokens are NOT in the vocabulary, they are added
    to it (indexed starting from the last index of the current vocabulary).

    Args:
        special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
            [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
            ``additional_special_tokens``].

            Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).

    Returns:
        Number of tokens added to the vocabulary.

    Examples::

        # Let's see how to add a new classification token to GPT-2
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')

        special_tokens_dict = {'cls_token': '<CLS>'}

        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        print('We have added', num_added_toks, 'tokens')
        model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.

        assert tokenizer.cls_token == '<CLS>'
    """
    if not special_tokens_dict:
        return 0

    added_tokens = 0
    for key, value in special_tokens_dict.items():
        assert key in self.SPECIAL_TOKENS_ATTRIBUTES
        if key == 'additional_special_tokens':
            assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
            added_tokens += self.add_tokens(value)
        else:
            assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
            added_tokens += self.add_tokens([value])
        logger.info("Assigning %s to the %s key of the tokenizer", value, key)
        setattr(self, key, value)

    return added_tokens
def tokenize(self, text, **kwargs):
    """ Converts a string into a sequence of tokens (string), using the tokenizer.

@@ -347,15 +489,45 @@ class PreTrainedTokenizer(object):
        Take care of added tokens.
    """
    def split_on_token(tok, text):
        result = []
        split_text = text.split(tok)
        for i, sub_text in enumerate(split_text):
            sub_text = sub_text.strip()
            if i == 0 and not sub_text:
                result += [tok]
            elif i == len(split_text) - 1:
                if sub_text:
                    result += [sub_text]
                else:
                    pass
            else:
                if sub_text:
                    result += [sub_text]
                result += [tok]
        return result

    def split_on_tokens(tok_list, text):
        if not text:
            return []
        if not tok_list:
            return self._tokenize(text, **kwargs)

        tokenized_text = []
        text_list = [text]
        for tok in tok_list:
            tokenized_text = []
            for sub_text in text_list:
                if sub_text not in self.added_tokens_encoder \
                        and sub_text not in self.all_special_tokens:
                    tokenized_text += split_on_token(tok, sub_text)
                else:
                    tokenized_text += [sub_text]
            text_list = tokenized_text

        return sum((self._tokenize(token, **kwargs) if token not \
            in self.added_tokens_encoder and token not in self.all_special_tokens \
            else [token] for token in tokenized_text), [])

    added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
    tokenized_text = split_on_tokens(added_tokens, text)
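To make the control flow above concrete, a hedged walk-through (token and text invented): once '<new_tok>' has been registered through ``add_tokens``, ``tokenize`` first splits the input on it and only runs the model-specific ``_tokenize`` on the remaining sub-strings::

    tokenizer.add_tokens(['<new_tok>'])
    tokenizer.tokenize("hello <new_tok> world")
    # equivalent to: self._tokenize("hello") + ['<new_tok>'] + self._tokenize("world")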
@@ -366,13 +538,13 @@ class PreTrainedTokenizer(object):
        Split in words for word-based vocabulary or sub-words for sub-word-based
        vocabularies (BPE/SentencePiece/WordPiece).

        Do NOT take care of added tokens.
    """
    raise NotImplementedError

def convert_tokens_to_ids(self, tokens):
    """ Converts a single token, or a sequence of tokens (str/unicode), into a single integer id
        (resp. a sequence of ids), using the vocabulary.
    """
    if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
        return self._convert_token_to_id_with_added_voc(tokens)

@@ -394,13 +566,39 @@ class PreTrainedTokenizer(object):
def _convert_token_to_id(self, token):
    raise NotImplementedError
def encode(self, text, text_pair=None, add_special_tokens=False):
    """
    Converts a string into a sequence of ids (integer), using the tokenizer and vocabulary.

    Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

    Args:
        text: The first sequence to be encoded.
        text_pair: Optional second sequence to be encoded.
        add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
            to their model.
    """
    if text_pair is None:
        if add_special_tokens:
            return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
        else:
            return self.convert_tokens_to_ids(self.tokenize(text))

    first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
    second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)]

    if add_special_tokens:
        return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
    else:
        return first_sentence_tokens, second_sentence_tokens
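A usage sketch (shortcut name illustrative; on the base class the two hooks below are no-ops that only log a warning, while derived classes like BertTokenizer override them)::

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    ids = tokenizer.encode("Hello world")  # same as convert_tokens_to_ids(tokenize(...))
    pair_ids = tokenizer.encode("Hello", "world", add_special_tokens=True)
    # -> [CLS] ids("Hello") [SEP] ids("world") [SEP] for Bert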
def add_special_tokens_single_sentence(self, token_ids):
    logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
    return token_ids

def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
    return token_ids_0 + token_ids_1
def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
    """ Converts a single index or a sequence of indices (integers) into a token

@@ -435,14 +633,28 @@ class PreTrainedTokenizer(object):
    return ' '.join(self.convert_ids_to_tokens(tokens))
def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
    """
    Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary
    with options to remove special tokens and clean up tokenization spaces.
    Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
    """
    filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
    text = self.convert_tokens_to_string(filtered_tokens)

    if self._sep_token is not None and self._sep_token in text:
        text = text.replace(self._cls_token, self._sep_token)
        split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
        if clean_up_tokenization_spaces:
            clean_text = [self.clean_up_tokenization(text) for text in split_text]
            return clean_text
        else:
            return split_text
    else:
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
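A hedged sketch of the resulting behaviour (strings invented): when the decoded text contains the separator token, ``decode`` now returns a list of cleaned sentence strings instead of one string::

    ids = tokenizer.encode("Hello", "world", add_special_tokens=True)
    tokenizer.decode(ids)
    # -> e.g. ["Hello", "world"]: the text was split on sep_token and cleaned up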
@property
def special_tokens_map(self):

@@ -474,13 +686,14 @@ class PreTrainedTokenizer(object):
        class attributes (cls_token, unk_token...).
    """
    all_toks = self.all_special_tokens
    all_ids = list(self._convert_token_to_id(t) for t in all_toks)
    return all_ids
@staticmethod
def clean_up_tokenization(out_string):
    """ Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.
    """
    out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
    return out_string
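A quick worked example of the replace chain above (input invented)::

    PreTrainedTokenizer.clean_up_tokenization("do n't stop , please !")
    # -> "don't stop, please!"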
@@ -122,10 +122,15 @@ class XLMTokenizer(PreTrainedTokenizer):
                         cls_token=cls_token, mask_token=mask_token,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
        try:
            import ftfy
            from spacy.lang.en import English
            _nlp = English()
            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.")
@@ -214,6 +219,22 @@ class XLMTokenizer(PreTrainedTokenizer):
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        return out_string
    def add_special_tokens_single_sentence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        An XLM sequence has the following format: [CLS] X [SEP]
        """
        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return cls + token_ids_0 + sep + token_ids_1 + sep
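A usage sketch of the XLM format above (shortcut name illustrative)::

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
    ids = tokenizer.encode("Hello", "world", add_special_tokens=True)
    # -> [CLS] ids("Hello") [SEP] ids("world") [SEP]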
    def save_vocabulary(self, save_directory):
        """Save the tokenizer vocabulary and merge files to a directory."""
        if not os.path.isdir(save_directory):
......
@@ -23,7 +23,7 @@ from shutil import copyfile
import unicodedata

import six

from .tokenization_utils import PreTrainedTokenizer

logger = logging.getLogger(__name__)
@@ -71,6 +71,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
                         pad_token=pad_token, cls_token=cls_token,
                         mask_token=mask_token, additional_special_tokens=
                         additional_special_tokens, **kwargs)

        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
        try:
            import sentencepiece as spm
        except ImportError:

@@ -177,6 +181,24 @@ class XLNetTokenizer(PreTrainedTokenizer):
        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
        return out_string
    def add_special_tokens_single_sentence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        An XLNet sequence has the following format: X [SEP][CLS]
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return token_ids + sep + cls

    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
        """
        sep = [self._convert_token_to_id(self.sep_token)]
        cls = [self._convert_token_to_id(self.cls_token)]
        return token_ids_0 + sep + token_ids_1 + sep + cls
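A matching usage sketch (shortcut name illustrative; note that for XLNet the CLS token comes last)::

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    ids = tokenizer.encode("Hello", "world", add_special_tokens=True)
    # -> ids("Hello") [SEP] ids("world") [SEP][CLS]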
    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
......
# PyTorch
torch>=1.0.0
# progress bars in model download and training scripts
tqdm
# Accessing files from S3 directly.
......
@@ -38,10 +38,10 @@ from setuptools import find_packages, setup
setup(
    name="pytorch_transformers",
    version="1.1.0",
    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
    author_email="thomas@huggingface.co",
    description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
    long_description=open("README.md", "r", encoding='utf-8').read(),
    long_description_content_type="text/markdown",
    keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',

@@ -49,7 +49,7 @@ setup(
    url="https://github.com/huggingface/pytorch-transformers",
    packages=find_packages(exclude=["*.tests", "*.tests.*",
                                    "tests.*", "tests"]),
    install_requires=['torch>=1.0.0',
                      'numpy',
                      'boto3',
                      'requests',
......