Merge branch 'master' into t5

110394b2 · Thomas Wolf · GitHub · 33e72b08 · 7296f101 · 110394b2
Unverified Commit 110394b2 authored Dec 13, 2019 by Thomas Wolf Committed by GitHub Dec 13, 2019
8 changed files
--- a/transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -23,7 +23,7 @@ import logging
 from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
 from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .utils import slow
+from .utils import slow, SMALL_MODEL_IDENTIFIER
 class AutoTokenizerTest(unittest.TestCase):
@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
            self.assertIsInstance(tokenizer, GPT2Tokenizer)
            self.assertGreater(len(tokenizer), 0)
+    def test_tokenizer_from_pretrained_identifier(self):
+        """Load a tokenizer through SMALL_MODEL_IDENTIFIER and check its class and length."""
+        logging.basicConfig(level=logging.INFO)
+        loaded = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(loaded, BertTokenizer)
+        # The identifier is expected to resolve to a tokenizer whose len() is 12.
+        self.assertEqual(len(loaded), 12)
 if __name__ == "__main__":
    unittest.main()
--- a/transformers/tests/tokenization_bert_japanese_test.py
+++ b/transformers/tests/tokenization_bert_japanese_test.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+import os
+import unittest
+from io import open
+from transformers.tokenization_bert import WordpieceTokenizer
+from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
+                                                     MecabTokenizer, CharacterTokenizer,
+                                                     VOCAB_FILES_NAMES)
+from .tokenization_tests_commons import CommonTestCases
+from .utils import slow, custom_tokenizers
+@custom_tokenizers
+class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
+    """Tests for BertJapaneseTokenizer with wordpiece subwords and for MecabTokenizer.
+    The whole class is wrapped by `custom_tokenizers`, imported from `.utils`.
+    """
+    tokenizer_class = BertJapaneseTokenizer
+    def setUp(self):
+        """Write a small wordpiece vocabulary file into the temporary test directory."""
+        super(BertJapaneseTokenizationTest, self).setUp()
+        # The position of each entry is its token id (see the ids asserted in test_full_tokenizer).
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは",
+            u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+    def get_tokenizer(self, **kwargs):
+        """Return a tokenizer loaded from the vocabulary written by setUp."""
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+    def get_input_output_texts(self):
+        """Return a raw input text and the text expected after tokenizing and re-joining."""
+        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
+        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
+        return input_text, output_text
+    def test_full_tokenizer(self):
+        """Tokenize with the default word/subword tokenizers and check tokens and ids."""
+        tokenizer = self.tokenizer_class(self.vocab_file)
+        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
+        self.assertListEqual(tokens,
+                             [u"こんにちは", u"、", u"世界", u"。",
+                              u"こん", u"##ばんは", u"、", u"世界", u"。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [3, 12, 10, 14, 4, 9, 12, 10, 14])
+    def test_mecab_tokenizer(self):
+        """Default MecabTokenizer: text is normalized, case is kept."""
+        tokenizer = MecabTokenizer()
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+            [u"アップルストア", u"で", u"iPhone", u"8", u"が",
+             u"発売", u"さ", u"れ", u"た", u"。"])
+    def test_mecab_tokenizer_lower(self):
+        """With do_lower_case=True the Latin token is lower-cased."""
+        tokenizer = MecabTokenizer(do_lower_case=True)
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+            [u"アップルストア", u"で", u"iphone", u"8", u"が",
+             u"発売", u"さ", u"れ", u"た", u"。"])
+    def test_mecab_tokenizer_no_normalize(self):
+        """With normalize_text=False half/full-width characters are left as they are."""
+        tokenizer = MecabTokenizer(normalize_text=False)
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+            [u"ｱｯﾌﾟﾙストア", u"で", u"iPhone", u"８", u"が",
+             u"発売", u"さ", u"れ", u"た", u"　", u"。"])
+    def test_wordpiece_tokenizer(self):
+        """Check WordpieceTokenizer directly against an in-memory vocabulary."""
+        # Every entry must be comma separated: adjacent string literals are silently
+        # concatenated into a single vocabulary entry.
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは"]
+        vocab = {}
+        for (i, token) in enumerate(vocab_tokens):
+            vocab[token] = i
+        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")
+        self.assertListEqual(tokenizer.tokenize(u""), [])
+        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
+                             [u"こんにちは"])
+        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
+                             [u"こん", u"##ばんは"])
+        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
+                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
+    @slow
+    def test_sequence_builders(self):
+        """Check special-token placement for single and paired inputs on a pretrained vocabulary."""
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
+        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+        # 2 is for "[CLS]", 3 is for "[SEP]"
+        # unittest assertions are used because bare `assert` is removed under `python -O`.
+        self.assertEqual(encoded_sentence, [2] + text + [3])
+        self.assertEqual(encoded_pair, [2] + text + [3] + text_2 + [3])
+class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
+    """Tests for BertJapaneseTokenizer with character subwords and for CharacterTokenizer."""
+    tokenizer_class = BertJapaneseTokenizer
+    def setUp(self):
+        """Write a small character vocabulary file into the temporary test directory."""
+        super(BertJapaneseCharacterTokenizationTest, self).setUp()
+        # The position of each entry is its token id (see the ids asserted in test_full_tokenizer).
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+    def get_tokenizer(self, **kwargs):
+        """Return a character-level tokenizer loaded from the vocabulary written by setUp."""
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
+                                                     subword_tokenizer_type="character",
+                                                     **kwargs)
+    def get_input_output_texts(self):
+        """Return a raw input text and the text expected after tokenizing and re-joining."""
+        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
+        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
+        return input_text, output_text
+    def test_full_tokenizer(self):
+        """Tokenize into characters and check both tokens and ids."""
+        tokenizer = self.tokenizer_class(self.vocab_file,
+                                         subword_tokenizer_type="character")
+        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
+        self.assertListEqual(tokens,
+            [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
+             u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
+                              3, 4, 8, 4, 7, 11, 9, 10, 12])
+    def test_character_tokenizer(self):
+        """Check CharacterTokenizer directly against an in-memory vocabulary."""
+        # Every entry must be comma separated: adjacent string literals are silently
+        # concatenated into a single vocabulary entry.
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
+        vocab = {}
+        for (i, token) in enumerate(vocab_tokens):
+            vocab[token] = i
+        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")
+        self.assertListEqual(tokenizer.tokenize(u""), [])
+        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
+                             [u"こ", u"ん", u"に", u"ち", u"は"])
+        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
+                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])
+    # NOTE(review): unlike the wordpiece test class, this class is not wrapped by
+    # `custom_tokenizers`; the checkpoint loaded below presumably needs MeCab - confirm.
+    @slow
+    def test_sequence_builders(self):
+        """Check special-token placement for single and paired inputs on a pretrained vocabulary."""
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
+        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+        # 2 is for "[CLS]", 3 is for "[SEP]"
+        # unittest assertions are used because bare `assert` is removed under `python -O`.
+        self.assertEqual(encoded_sentence, [2] + text + [3])
+        self.assertEqual(encoded_pair, [2] + text + [3] + text_2 + [3])
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -6,18 +6,26 @@ from distutils.util import strtobool
 from transformers.file_utils import _tf_available, _torch_available
-try:
+SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
-    run_slow = os.environ["RUN_SLOW"]
-except KeyError:
-    # RUN_SLOW isn't set, default to skipping slow tests.
+def parse_flag_from_env(key, default=False):
+    """
+    Read the environment variable `key` as a yes/no flag.
+    Returns `default` when the variable is not set, otherwise the result of
+    `strtobool` applied to its value. Raises ValueError (naming `key`) when
+    `strtobool` rejects the value.
+    """
-    _run_slow_tests = False
-else:
-    # RUN_SLOW is set, convert it to True or False.
    try:
-        _run_slow_tests = strtobool(run_slow)
+        value = os.environ[key]
-    except ValueError:
+    except KeyError:
-        # More values are supported, but let's keep the message simple.
+        # KEY isn't set, default to `default`.
-        raise ValueError("If set, RUN_SLOW must be yes or no.")
+        _value = default
+    else:
+        # KEY is set, convert it to True or False.
+        try:
+            _value = strtobool(value)
+        except ValueError:
+            # More values are supported, but let's keep the message simple.
+            raise ValueError("If set, {} must be yes or no.".format(key))
+    return _value
+_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
+_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
 def slow(test_case):
@@ -33,6 +41,19 @@ def slow(test_case):
    return test_case
+def custom_tokenizers(test_case):
+    """
+    Decorator marking a test for a custom tokenizer.
+    Such tests need additional dependencies and are therefore skipped by
+    default. They run only when the RUN_CUSTOM_TOKENIZERS environment
+    variable is set to a truthy value.
+    """
+    if _run_custom_tokenizers:
+        # Flag enabled: hand the test back untouched.
+        return test_case
+    return unittest.skip("test of custom tokenizers")(test_case)
 def require_torch(test_case):
    """
    Decorator marking a test that requires PyTorch.
@@ -59,6 +80,6 @@ def require_tf(test_case):
 if _torch_available:
    # Set the USE_CUDA environment variable to select a GPU.
-    torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
+    torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
 else:
    torch_device = None
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging
 from .tokenization_bert import BertTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_ctrl import CTRLTokenizer
@@ -75,6 +76,7 @@ class AutoTokenizer(object):
            - contains `albert`: AlbertTokenizer (ALBERT model)
            - contains `camembert`: CamembertTokenizer (CamemBERT model)
            - contains `roberta`: RobertaTokenizer (RoBERTa model)
+            - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
@@ -87,6 +89,7 @@ class AutoTokenizer(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
@@ -109,8 +112,14 @@ class AutoTokenizer(object):
        Examples::
-            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
+            # Download vocabulary from S3 and cache.
-            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
+            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
        """
        if 't5' in pretrained_model_name_or_path:
@@ -123,6 +132,8 @@ class AutoTokenizer(object):
            return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'bert-base-japanese' in pretrained_model_name_or_path:
+            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:

--- a/transformers/tokenization_bert_japanese.py
+++ b/transformers/tokenization_bert_japanese.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+from __future__ import absolute_import, division, print_function, unicode_literals
+import collections
+import logging
+import os
+import six
+import unicodedata
+from io import open
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
+from .tokenization_utils import PreTrainedTokenizer
+logger = logging.getLogger(__name__)
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+# Shortcut names of the pretrained checkpoints, grouped by the subword tokenizer they use.
+_WORDPIECE_MODEL_NAMES = ('bert-base-japanese', 'bert-base-japanese-whole-word-masking')
+_CHARACTER_MODEL_NAMES = ('bert-base-japanese-char', 'bert-base-japanese-char-whole-word-masking')
+_ALL_MODEL_NAMES = _WORDPIECE_MODEL_NAMES + _CHARACTER_MODEL_NAMES
+# All vocabulary files share one URL layout that differs only in the shortcut name.
+_VOCAB_URL_TEMPLATE = "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/{}-vocab.txt"
+# Maps each vocabulary file id to the download URL of every shortcut name.
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file': {name: _VOCAB_URL_TEMPLATE.format(name) for name in _ALL_MODEL_NAMES}
+}
+# Every checkpoint is registered with a size of 512.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {name: 512 for name in _ALL_MODEL_NAMES}
+# Constructor defaults per checkpoint; each entry gets its own dict object.
+PRETRAINED_INIT_CONFIGURATION = {
+    name: {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': subword_type
+    }
+    for names, subword_type in ((_WORDPIECE_MODEL_NAMES, 'wordpiece'),
+                                (_CHARACTER_MODEL_NAMES, 'character'))
+    for name in names
+}
+class BertJapaneseTokenizer(BertTokenizer):
+    """BERT tokenizer for Japanese text.
+    Text is first split into words (`BasicTokenizer` or `MecabTokenizer`) and each word
+    is then split into subwords (`WordpieceTokenizer` or `CharacterTokenizer`).
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    def __init__(self, vocab_file, do_lower_case=False,
+                 do_word_tokenize=True, do_subword_tokenize=True,
+                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
+                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
+                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
+        """Constructs a BertJapaneseTokenizer.
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
+            **do_lower_case**: (`optional`) boolean (default False)
+                Whether to lower case the input.
+                Only has an effect when do_word_tokenize=True.
+            **do_word_tokenize**: (`optional`) boolean (default True)
+                Whether to do word tokenization.
+            **do_subword_tokenize**: (`optional`) boolean (default True)
+                Whether to do subword tokenization.
+            **word_tokenizer_type**: (`optional`) string (default "basic")
+                Type of word tokenizer, either "basic" or "mecab".
+            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
+                Type of subword tokenizer, either "wordpiece" or "character".
+            **never_split**: (`optional`) list of str (default None)
+                Passed to the word tokenizer when it is constructed.
+        Raises:
+            ValueError: if `vocab_file` is not an existing file, or if an enabled
+                tokenizer type is not one of the values listed above.
+        """
+        # super(BertTokenizer, ...) starts the lookup after BertTokenizer, so
+        # BertTokenizer.__init__ is not run; the vocabulary and the word/subword
+        # tokenizers are set up below instead.
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a pretrained "
+                "model use `tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        # Reverse mapping (id -> token) built from the loaded vocabulary.
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_word_tokenize = do_word_tokenize
+        if do_word_tokenize:
+            if word_tokenizer_type == 'basic':
+                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split,
+                                                     tokenize_chinese_chars=False)
+            elif word_tokenizer_type == 'mecab':
+                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split)
+            else:
+                raise ValueError(
+                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
+        self.do_subword_tokenize = do_subword_tokenize
+        if do_subword_tokenize:
+            if subword_tokenizer_type == 'wordpiece':
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            elif subword_tokenizer_type == 'character':
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            else:
+                raise ValueError(
+                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
+    def _tokenize(self, text):
+        """Split `text` into words, then each word into subwords, honouring the two do_* flags."""
+        if self.do_word_tokenize:
+            # Special tokens are passed as never_split to the word tokenizer.
+            tokens = self.word_tokenizer.tokenize(text,
+                                                  never_split=self.all_special_tokens)
+        else:
+            # Without word tokenization the whole text is treated as one token.
+            tokens = [text]
+        if self.do_subword_tokenize:
+            split_tokens = [sub_token for token in tokens
+                            for sub_token in self.subword_tokenizer.tokenize(token)]
+        else:
+            split_tokens = tokens
+        return split_tokens
+class MecabTokenizer(object):
+    """Runs basic tokenization with MeCab morphological parser."""
+    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
+        """Constructs a MecabTokenizer.
+        Args:
+            **do_lower_case**: (`optional`) boolean (default False)
+                Whether to lower case the input.
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of token not to split.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+        # Imported here rather than at module level, so importing this module
+        # does not require MeCab to be installed.
+        import MeCab
+        self.mecab = MeCab.Tagger()
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text.
+        Args:
+            text: The text to tokenize.
+            never_split: (`optional`) iterable of tokens that, in addition to
+                `self.never_split`, are exempt from lower-casing.
+        Returns:
+            The list of surface forms reported by MeCab, lower-cased when
+            `do_lower_case` is set.
+        """
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+        # Merge both sources into a set: any iterable (list, tuple, set) is accepted
+        # and the per-token membership test below is O(1).
+        protected = set(self.never_split)
+        if never_split is not None:
+            protected.update(never_split)
+        if six.PY2:
+            mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
+        else:
+            mecab_output = self.mecab.parse(text)
+        tokens = []
+        # Each output line is "<surface>\t<features>"; the line 'EOS' ends the parse.
+        for line in mecab_output.split('\n'):
+            if line == 'EOS':
+                break
+            token, _ = line.split('\t')
+            if self.do_lower_case and token not in protected:
+                token = token.lower()
+            tokens.append(token)
+        return tokens
+class CharacterTokenizer(object):
+    """Runs character tokenization."""
+    def __init__(self, vocab, unk_token, normalize_text=True):
+        """Constructs a CharacterTokenizer.
+        Args:
+            **vocab**:
+                Vocabulary object; only membership tests (`char in vocab`) are performed on it.
+            **unk_token**: str
+                Symbol emitted in place of any character missing from `vocab`.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply NFKC unicode normalization to text before tokenization.
+        """
+        self.normalize_text = normalize_text
+        self.unk_token = unk_token
+        self.vocab = vocab
+    def tokenize(self, text):
+        """Tokenizes a piece of text into characters.
+        For example:
+            input = "apple"
+            output = ["a", "p", "p", "l", "e"]
+        Args:
+            text: The text to split, one output item per character.
+        Returns:
+            A list with one entry per character of the (optionally normalized)
+            text: the character itself, or `unk_token` when it is not in the vocabulary.
+        """
+        normalized = unicodedata.normalize('NFKC', text) if self.normalize_text else text
+        known = self.vocab
+        fallback = self.unk_token
+        return [symbol if symbol in known else fallback for symbol in normalized]
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -25,7 +25,7 @@ import itertools
 import re
 from io import open
-from .file_utils import cached_path, is_tf_available, is_torch_available
+from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available
 if is_tf_available():
    import tensorflow as tf
@@ -226,7 +226,7 @@ class PreTrainedTokenizer(object):
        self.max_len = max_len if max_len is not None else int(1e12)
-        # Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed.
+        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
        self.padding_side = kwargs.pop('padding_side', self.padding_side)
        # Added tokens
@@ -255,6 +255,7 @@ class PreTrainedTokenizer(object):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
@@ -282,6 +283,9 @@ class PreTrainedTokenizer(object):
            # Download vocabulary from S3 and cache.
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
@@ -327,12 +331,15 @@ class PreTrainedTokenizer(object):
                if os.path.isdir(pretrained_model_name_or_path):
                    # If a directory is provided we look for the standard filenames
                    full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
-                else:
+                    if not os.path.exists(full_file_name):
+                        logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                        full_file_name = None
+                elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                    # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                    full_file_name = pretrained_model_name_or_path
-                if not os.path.exists(full_file_name):
+                else:
-                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                    full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
-                    full_file_name = None
                vocab_files[file_id] = full_file_name
            # Look for the additional tokens files
@@ -628,7 +635,6 @@ class PreTrainedTokenizer(object):
            Take care of added tokens.
            text: The sequence to be encoded.
-            return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
            **kwargs: passed to the child `self.tokenize()` method
        """
        def lowercase_text(t):
@@ -663,7 +669,7 @@ class PreTrainedTokenizer(object):
            return result
        def split_on_tokens(tok_list, text):
-            if not text:
+            if not text.strip():
                return []
            if not tok_list:
                return self._tokenize(text, **kwargs)
@@ -917,7 +923,7 @@ class PreTrainedTokenizer(object):
            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                or PyTorch torch.Tensor instead of a list of python integers.
            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoir returning attention mask (default True)
+            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
@@ -962,24 +968,13 @@ class PreTrainedTokenizer(object):
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
-            special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
        if return_special_tokens_mask:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
-        # Prepare inputs as tensors if asked
-        if return_tensors == 'tf' and is_tf_available():
-            sequence = tf.constant([sequence])
-            token_type_ids = tf.constant([token_type_ids])
-        elif return_tensors == 'pt' and is_torch_available():
-            sequence = torch.tensor([sequence])
-            token_type_ids = torch.tensor([token_type_ids])
-        elif return_tensors is not None:
-            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
        encoded_inputs["input_ids"] = sequence
        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
@@ -1003,7 +998,7 @@ class PreTrainedTokenizer(object):
        )
        if pad_to_max_length and max_length is None and self.max_len > 10000:
-            logger.warning("Sequence can't be padded as the maximum  ")
+            logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
        if needs_to_be_padded:
            difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
@@ -1016,10 +1011,9 @@ class PreTrainedTokenizer(object):
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
            elif self.padding_side == 'left':
                if return_attention_mask:
-                    encoded_inputs["attention_mask"] =  [0] * difference + [1] * len(encoded_inputs["input_ids"])
+                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
                if return_token_type_ids:
                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
                if return_special_tokens_mask:
@@ -1031,7 +1025,26 @@ class PreTrainedTokenizer(object):
        elif return_attention_mask:
            encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
+        # Prepare inputs as tensors if asked
+        if return_tensors == 'tf' and is_tf_available():
+            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
+        elif return_tensors == 'pt' and is_torch_available():
+            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
+        elif return_tensors is not None:
+            logger.warning(
+                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
+                    return_tensors))
        return encoded_inputs
    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):

--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                           additional_special_tokens=additional_special_tokens,
                                           **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens 
        # cache of sm.MosesPunctNormalizer instance
        self.cache_moses_punct_normalizer = dict()
        # cache of sm.MosesTokenizer instance

--- a/utils/link_tester.py
+++ b/utils/link_tester.py
+""" Link tester.
+This little utility reads all the python files in the repository,
+scans for links pointing to S3 and tests the links one by one. Exits with a
+non-zero status at the end of the scan if at least one link was reported broken.
+"""
+import os
+import re
+import sys
+import requests
+# Matches a quoted string literal that starts with "https://s3". Groups: (1) the
+# opening quote, (2) the "https://s3" prefix, (3) the remainder of the link.
+# The lazy `.*?` stops at the first matching closing quote, so several links
+# written on the same line are reported separately instead of being merged.
+REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*?)\1"""
+def list_python_files_in_repository():
+    """ List all python files in the repository.
+    This function assumes that the script is executed in the root folder.
+    Directories whose path contains "templates" are skipped.
+    Returns a list of file paths relative to the current working directory.
+    """
+    source_code_files = []
+    for path, _, files in os.walk("."):
+        if "templates" in path:
+            continue
+        for name in files:
+            # Test the extension itself: a substring check would also accept
+            # names such as "notes.py.bak" or "module.pyx".
+            if name.endswith(".py"):
+                source_code_files.append(os.path.join(path, name))
+    return source_code_files
+def find_all_links(file_paths):
+    """ Gather the links of every file in `file_paths` into a single flat list.
+    Files are scanned in the order given; links keep the order returned by
+    `scan_code_for_links` for each file.
+    """
+    return [link for file_path in file_paths for link in scan_code_for_links(file_path)]
+def scan_code_for_links(source):
+    """ Scans the file to find links using a regular expression.
+    `source` is the path of the file to read. The file is decoded as UTF-8 and
+    undecodable bytes are replaced instead of raising, so one odd byte in a
+    file does not abort the whole scan.
+    Returns a list of links.
+    """
+    import io  # local import; `io.open` takes explicit `encoding` and `errors`
+    with io.open(source, "r", encoding="utf-8", errors="replace") as source_file:
+        content = source_file.read()
+    raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
+    # Each match is (quote, prefix, suffix); the link is prefix + suffix.
+    return [prefix + suffix for _, prefix, suffix in raw_links]
+def check_all_links(links, timeout=10):
+    """ Check that the provided links are valid.
+    Links are considered valid if a HEAD request to the server
+    returns a 200 status code. A link whose request fails (connection error,
+    timeout, ...) is reported as broken instead of aborting the scan.
+    `timeout` is the number of seconds to wait for each request.
+    Returns the list of broken links.
+    """
+    broken_links = []
+    for link in links:
+        try:
+            # Always pass a timeout: without one the request can block
+            # indefinitely on a server that never answers.
+            head = requests.head(link, timeout=timeout)
+        except requests.RequestException:
+            broken_links.append(link)
+            continue
+        if head.status_code != 200:
+            broken_links.append(link)
+    return broken_links
+if __name__ == "__main__":
+    # Announce the scan before running it, so the message is not printed only
+    # after every link has already been checked.
+    print("Looking for broken links to pre-trained models/configs/tokenizers...")
+    file_paths = list_python_files_in_repository()
+    links = find_all_links(file_paths)
+    broken_links = check_all_links(links)
+    if broken_links:
+        print("The following links did not respond:")
+        for link in broken_links:
+            print("- {}".format(link))
+        # Non-zero exit status signals the failure to the calling process.
+        sys.exit(1)
+    print("All links are ok.")