Merge branch 'master' into cli

c7be096c · thomwolf · 3492a6ec · 33adab2b · c7be096c · c7be096c
Commit c7be096c authored Dec 19, 2019 by thomwolf
12 changed files
--- a/transformers/tests/tokenization_gpt2_test.py
+++ b/transformers/tests/tokenization_gpt2_test.py
@@ -67,6 +67,5 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

-
 if __name__ == '__main__':
    unittest.main()
--- a/transformers/tests/tokenization_t5_test.py
+++ b/transformers/tests/tokenization_t5_test.py
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+
+from transformers.tokenization_t5 import (T5Tokenizer)
+from transformers.tokenization_xlnet import SPIECE_UNDERLINE
+
+from .tokenization_tests_commons import CommonTestCases
+
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                    'fixtures/test_sentencepiece.model')
+
+class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
+    """Tokenizer tests for T5Tokenizer, driven by the SentencePiece fixture in SAMPLE_VOCAB."""
+
+    tokenizer_class = T5Tokenizer
+
+    def setUp(self):
+        """Build a tokenizer from the fixture and save it into self.tmpdirname."""
+        super(T5TokenizationTest, self).setUp()
+
+        # Persist a fixture-based tokenizer so get_tokenizer() can reload it
+        # through from_pretrained().
+        T5Tokenizer(SAMPLE_VOCAB).save_pretrained(self.tmpdirname)
+
+    def get_tokenizer(self, **kwargs):
+        """Reload the tokenizer saved by setUp(), forwarding any keyword arguments."""
+        return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self):
+        """Return an (input text, output text) pair; both are the same sentence."""
+        sample = u"This is a test"
+        return sample, sample
+
+    def test_full_tokenizer(self):
+        """Check tokenize, convert_tokens_to_ids and convert_ids_to_tokens against fixed expectations."""
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
+        us = SPIECE_UNDERLINE
+
+        # Short plain sentence.
+        simple_pieces = tokenizer.tokenize(u'This is a test')
+        self.assertListEqual(simple_pieces, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(simple_pieces), [285, 46, 10, 170, 382])
+
+        # Sentence containing digits and an accented character.
+        expected_pieces = [
+            us + u'I', us + u'was', us + u'b', u'or', u'n', us + u'in', us + u'',
+            u'9', u'2', u'0', u'0', u'0', u',', us + u'and', us + u'this',
+            us + u'is', us + u'f', u'al', u's', u'é', u'.',
+        ]
+        expected_ids = [
+            8, 21, 84, 55, 24, 19, 7, 0,
+            602, 347, 347, 347, 3, 12, 66,
+            46, 72, 80, 6, 0, 4,
+        ]
+
+        pieces = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+        self.assertListEqual(pieces, expected_pieces)
+
+        piece_ids = tokenizer.convert_tokens_to_ids(pieces)
+        self.assertListEqual(piece_ids, expected_ids)
+
+        # u'9' and u'é' are expected to map to id 0 above, so converting the
+        # ids back is expected to yield u'<unk>' in those two positions.
+        expected_back = [u'<unk>' if piece in (u'9', u'é') else piece
+                         for piece in expected_pieces]
+        self.assertListEqual(tokenizer.convert_ids_to_tokens(piece_ids), expected_back)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -232,6 +232,15 @@ class CommonTestCases:
            self.assertNotEqual(len(tokens_2), 0)
            self.assertIsInstance(text_2, (str, unicode))

+        def test_encode_decode_with_spaces(self):
+            """Added tokens, one of which contains a space, must survive an encode/decode round trip."""
+            tokenizer = self.get_tokenizer()
+            new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
+            tokenizer.add_tokens(new_toks)
+            input_text = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+            encoded = tokenizer.encode(input_text, add_special_tokens=False)
+            decoded = tokenizer.decode(encoded)
+            self.assertEqual(decoded, input_text)

        def test_pretrained_model_lists(self):
            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())

--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -6,18 +6,26 @@ from distutils.util import strtobool
 from transformers.file_utils import _tf_available, _torch_available


-try:
-    run_slow = os.environ["RUN_SLOW"]
-except KeyError:
-    # RUN_SLOW isn't set, default to skipping slow tests.
-    _run_slow_tests = False
-else:
-    # RUN_SLOW is set, convert it to True or False.
+SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
+
+
+def parse_flag_from_env(key, default=False):
    try:
-        _run_slow_tests = strtobool(run_slow)
-    except ValueError:
-        # More values are supported, but let's keep the message simple.
-        raise ValueError("If set, RUN_SLOW must be yes or no.")
+        value = os.environ[key]
+    except KeyError:
+        # KEY isn't set, default to `default`.
+        _value = default
+    else:
+        # KEY is set, convert it to True or False.
+        try:
+            _value = strtobool(value)
+        except ValueError:
+            # More values are supported, but let's keep the message simple.
+            raise ValueError("If set, {} must be yes or no.".format(key))
+    return _value
+
+_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
+_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)


 def slow(test_case):
@@ -33,6 +41,19 @@ def slow(test_case):
    return test_case


+def custom_tokenizers(test_case):
+    """
+    Decorator for tests that exercise a custom tokenizer.
+
+    Such tests need additional dependencies, so they are skipped unless
+    the RUN_CUSTOM_TOKENIZERS environment variable is set to a truthy
+    value.
+    """
+    if _run_custom_tokenizers:
+        return test_case
+    return unittest.skip("test of custom tokenizers")(test_case)
+
+
 def require_torch(test_case):
    """
    Decorator marking a test that requires PyTorch.
@@ -59,6 +80,6 @@ def require_tf(test_case):

 if _torch_available:
    # Set the USE_CUDA environment variable to select a GPU.
-    torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
+    torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
 else:
    torch_device = None
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging

 from .tokenization_bert import BertTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_ctrl import CTRLTokenizer
@@ -29,6 +30,7 @@ from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_camembert import CamembertTokenizer
 from .tokenization_albert import AlbertTokenizer
+from .tokenization_t5 import T5Tokenizer

 logger = logging.getLogger(__name__)

@@ -43,6 +45,7 @@ class AutoTokenizer(object):

        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Tokenizer (T5 model)
            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
            - contains `albert`: AlbertTokenizer (ALBERT model)
            - contains `camembert`: CamembertTokenizer (CamemBERT model)
@@ -68,10 +71,12 @@ class AutoTokenizer(object):

        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Tokenizer (T5 model)
            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
            - contains `albert`: AlbertTokenizer (ALBERT model)
            - contains `camembert`: CamembertTokenizer (CamemBERT model)
            - contains `roberta`: RobertaTokenizer (RoBERTa model)
+            - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
@@ -84,6 +89,7 @@ class AutoTokenizer(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.

@@ -106,11 +112,19 @@ class AutoTokenizer(object):

        Examples::

-            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
-            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+            # Download vocabulary from S3 and cache.
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+
+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+
+            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
+            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')

        """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
            return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'albert' in pretrained_model_name_or_path:
            return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
@@ -118,6 +132,8 @@ class AutoTokenizer(object):
            return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'bert-base-japanese' in pretrained_model_name_or_path:
+            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'openai-gpt' in pretrained_model_name_or_path:

--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -46,6 +46,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
        'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
        'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
        'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
+        'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
+        'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
    }
 }

@@ -65,6 +67,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'bert-base-cased-finetuned-mrpc': 512,
    'bert-base-german-dbmdz-cased': 512,
    'bert-base-german-dbmdz-uncased': 512,
+    'bert-base-finnish-cased-v1': 512,
+    'bert-base-finnish-uncased-v1': 512,
 }

 PRETRAINED_INIT_CONFIGURATION = {
@@ -83,6 +87,8 @@ PRETRAINED_INIT_CONFIGURATION = {
    'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
    'bert-base-german-dbmdz-cased': {'do_lower_case': False},
    'bert-base-german-dbmdz-uncased': {'do_lower_case': True},
+    'bert-base-finnish-cased-v1': {'do_lower_case': False},
+    'bert-base-finnish-uncased-v1': {'do_lower_case': True},
 }


@@ -113,12 +119,12 @@ class BertTokenizer(PreTrainedTokenizer):

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_wordpiece_only=False
+            do_basic_tokenize=True
    """

    vocab_files_names = VOCAB_FILES_NAMES

--- a/transformers/tokenization_bert_japanese.py
+++ b/transformers/tokenization_bert_japanese.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import logging
+import os
+import six
+import unicodedata
+from io import open
+
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
+        'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
+        'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
+        'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'bert-base-japanese': 512,
+    'bert-base-japanese-whole-word-masking': 512,
+    'bert-base-japanese-char': 512,
+    'bert-base-japanese-char-whole-word-masking': 512
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    'bert-base-japanese': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'wordpiece'
+    },
+    'bert-base-japanese-whole-word-masking':{
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'wordpiece'
+    },
+    'bert-base-japanese-char': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'character'
+    },
+    'bert-base-japanese-char-whole-word-masking': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'character'
+    }
+}
+
+
+class BertJapaneseTokenizer(BertTokenizer):
+    """BERT tokenizer for Japanese text.
+
+    Tokenization runs in two optional stages: a word tokenizer ('basic' or
+    'mecab') followed by a subword tokenizer ('wordpiece' or 'character').
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, do_lower_case=False,
+                 do_word_tokenize=True, do_subword_tokenize=True,
+                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
+                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
+                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
+        """Constructs a BertJapaneseTokenizer.
+
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
+            **do_lower_case**: (`optional`) boolean (default False)
+                Whether to lower case the input.
+                Only has an effect when do_word_tokenize=True.
+            **do_word_tokenize**: (`optional`) boolean (default True)
+                Whether to do word tokenization.
+            **do_subword_tokenize**: (`optional`) boolean (default True)
+                Whether to do subword tokenization.
+            **word_tokenizer_type**: (`optional`) string (default "basic")
+                Type of word tokenizer, either "basic" or "mecab".
+            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
+                Type of subword tokenizer, either "wordpiece" or "character".
+            **never_split**: (`optional`) list of str
+                Forwarded to the word tokenizer.
+                Only has an effect when do_word_tokenize=True.
+
+        Raises:
+            ValueError: if `vocab_file` is not an existing file, or if the type
+                given for an enabled tokenization stage is not recognized.
+        """
+        # Deliberately start the lookup *after* BertTokenizer in the MRO:
+        # BertTokenizer.__init__ is skipped and this class sets up the
+        # vocabulary and the tokenizers itself.
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+
+        # The word tokenizer is only created (and its type only validated)
+        # when word tokenization is enabled.
+        self.do_word_tokenize = do_word_tokenize
+        if do_word_tokenize:
+            if word_tokenizer_type == 'basic':
+                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split,
+                                                     tokenize_chinese_chars=False)
+            elif word_tokenizer_type == 'mecab':
+                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split)
+            else:
+                raise ValueError(
+                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
+
+        # Same for the subword tokenizer.
+        self.do_subword_tokenize = do_subword_tokenize
+        if do_subword_tokenize:
+            if subword_tokenizer_type == 'wordpiece':
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            elif subword_tokenizer_type == 'character':
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            else:
+                raise ValueError(
+                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
+
+
+    def _tokenize(self, text):
+        """Split `text` into tokens using the enabled word and subword stages.
+
+        When word tokenization is disabled the whole text is treated as one
+        token; when subword tokenization is disabled the word-level tokens
+        are returned unchanged.
+        """
+        if self.do_word_tokenize:
+            tokens = self.word_tokenizer.tokenize(text,
+                                                  never_split=self.all_special_tokens)
+        else:
+            tokens = [text]
+
+        if self.do_subword_tokenize:
+            split_tokens = [sub_token for token in tokens
+                            for sub_token in self.subword_tokenizer.tokenize(token)]
+        else:
+            split_tokens = tokens
+
+        return split_tokens
+
+
+class MecabTokenizer(object):
+    """Runs basic tokenization with MeCab morphological parser."""
+
+    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
+        """Constructs a MecabTokenizer.
+
+        Args:
+            **do_lower_case**: (`optional`) boolean (default False)
+                Whether to lower case the input.
+            **never_split**: (`optional`) list of str
+                Tokens that are never lower-cased, even when
+                do_lower_case is True.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+
+        # Imported here rather than at module level so that MeCab is only
+        # required when a MecabTokenizer is actually constructed.
+        import MeCab
+        self.mecab = MeCab.Tagger()
+
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text.
+
+        Args:
+            text: The text to tokenize. It is NFKC-normalized first when
+                `normalize_text` is set.
+            never_split: (`optional`) extra tokens, on top of those given to
+                the constructor, that are exempt from lower-casing.
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            The list of token surfaces, taken from the text before the tab on
+            each line of the MeCab output up to the 'EOS' line.
+        """
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+
+        # Accept any sequence (not only lists) for both sources of tokens.
+        never_split = list(self.never_split) + list(never_split if never_split is not None else [])
+
+        if six.PY2:
+            mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
+        else:
+            mecab_output = self.mecab.parse(text)
+
+        tokens = []
+        for line in mecab_output.split('\n'):
+            if line == 'EOS':
+                break
+
+            # Each line is expected to be "<surface>\t<features>".
+            token, _ = line.split('\t')
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+
+            tokens.append(token)
+
+        return tokens
+
+
+class CharacterTokenizer(object):
+    """Runs Character tokenization."""
+
+    def __init__(self, vocab, unk_token, normalize_text=True):
+        """Constructs a CharacterTokenizer.
+
+        Args:
+            **vocab**:
+                Vocabulary object; only membership tests (`char in vocab`) are used.
+            **unk_token**: str
+                A special symbol for out-of-vocabulary token.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.normalize_text = normalize_text
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into characters.
+
+        For example:
+            input = "apple"
+            output = ["a", "p", "p", "l", "e"]
+        Args:
+            text: The text to split. It is NFKC-normalized first when
+                `normalize_text` is set.
+        Returns:
+            A list with one entry per character of the (normalized) text;
+            characters missing from the vocabulary are replaced by `unk_token`.
+        """
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+
+        return [char if char in self.vocab else self.unk_token for char in text]
--- a/transformers/tokenization_distilbert.py
+++ b/transformers/tokenization_distilbert.py
@@ -53,12 +53,12 @@ class DistilBertTokenizer(BertTokenizer):

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_wordpiece_only=False
+            do_basic_tokenize=True
    """

    vocab_files_names = VOCAB_FILES_NAMES

--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
+# coding=utf-8
+# Copyright 2018 T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization class for model T5."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+import os
+import re
+import six
+from shutil import copyfile
+
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+SPIECE_UNDERLINE = u'▁'
+
+####################################################
+# Mapping from the keyword arguments names of Tokenizer `__init__`
+# to file names for serializing Tokenizer instances
+####################################################
+VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
+
+####################################################
+# Mapping from the keyword arguments names of Tokenizer `__init__`
+# to pretrained vocabulary URL for all the model shortcut names.
+####################################################
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+    }
+}
+
+####################################################
+# Mapping from model shortcut names to max length of inputs
+####################################################
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    't5-small': 512,
+    't5-base': 512,
+    't5-large': 512,
+    't5-3b': 512,
+    't5-11b': 512,
+}
+
+class T5Tokenizer(PreTrainedTokenizer):
+    """
+        SentencePiece based tokenizer. Peculiarities:
+
+            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+            - `extra_ids` add a number of extra ids added to the end of the vocabulary for use as sentinels.
+                These tokens are accessible as `<extra_id_{%d}>` where `{%d}` is a number between 0 and extra_ids-1.
+                Extra tokens are indexed from the end of the vocabulary up to beginning (<extra_id_0> is the last token in the vocabulary)
+                (like in T5 preprocessing
+                see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>",
+                 pad_token="<pad>", extra_ids=100, additional_special_tokens=None, **kwargs):
+        """Constructs a T5Tokenizer.
+
+        Args:
+            vocab_file: Path to the SentencePiece model file.
+            eos_token, unk_token, pad_token: Special tokens forwarded to the base class.
+            extra_ids: Number of `<extra_id_N>` sentinel tokens appended to the
+                additional special tokens and counted in `vocab_size`.
+            additional_special_tokens: (`optional`) further special tokens; the
+                list passed in is not modified.
+
+        Raises:
+            ImportError: if the `sentencepiece` package is not installed.
+        """
+        # Add extra_ids to the special token list
+        if extra_ids > 0:
+            # Work on a copy so a list supplied by the caller is not mutated.
+            additional_special_tokens = list(additional_special_tokens or [])
+            additional_special_tokens.extend([u"<extra_id_{}>".format(i) for i in range(extra_ids)])
+
+        super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token,
+                                          pad_token=pad_token, additional_special_tokens=additional_special_tokens,
+                                          **kwargs)
+
+        self.vocab_file = vocab_file
+        self._extra_ids = extra_ids
+
+        self.sp_model = self._load_sp_model(vocab_file)
+
+    @staticmethod
+    def _load_sp_model(vocab_file):
+        """Return a SentencePieceProcessor loaded from `vocab_file`; re-raises ImportError if sentencepiece is missing."""
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use T5Tokenizer: "
+                           "https://github.com/google/sentencepiece "
+                           "pip install sentencepiece")
+            # Without the package nothing below can work, so propagate the
+            # original error instead of failing later on an unbound name.
+            raise
+
+        sp_model = spm.SentencePieceProcessor()
+        sp_model.Load(vocab_file)
+        return sp_model
+
+    @property
+    def vocab_size(self):
+        """Number of SentencePiece pieces plus the number of extra sentinel ids."""
+        return self.sp_model.get_piece_size() + self._extra_ids
+
+    def __getstate__(self):
+        """Return the instance state with the SentencePiece processor removed.
+
+        NOTE(review): the processor is presumably dropped because it cannot be
+        pickled; it is rebuilt from `vocab_file` in `__setstate__`.
+        """
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        """Restore the instance state and reload the SentencePiece model from `self.vocab_file`."""
+        self.__dict__ = d
+        self.sp_model = self._load_sp_model(self.vocab_file)
+
+    def _tokenize(self, text, return_unicode=True, sample=False):
+        """ Take as input a string and return a list of strings (tokens) for words/sub-words
+
+        When `sample` is true the pieces come from SampleEncodeAsPieces(text, 64, 0.1)
+        instead of EncodeAsPieces(text).
+        """
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+
+        # convert back to unicode for py2
+        if six.PY2 and return_unicode:
+            pieces = [piece.decode('utf-8') if isinstance(piece, str) else piece
+                      for piece in pieces]
+
+        return pieces
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab.
+
+        `<extra_id_N>` maps to `vocab_size - N - 1`; any other token,
+        including a malformed `<extra_id_...` one, is looked up in the
+        SentencePiece model.
+        """
+        match = re.match(r'<extra_id_(\d+)>', token)
+        if match is not None:
+            return self.vocab_size - int(match.group(1)) - 1
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index, return_unicode=True):
+        """Converts an index (integer) in a token (string/unicode) using the vocab.
+
+        Indices at or beyond the SentencePiece vocabulary size are rendered as
+        `<extra_id_N>` with `N = vocab_size - 1 - index`.
+        """
+        if index < self.sp_model.get_piece_size():
+            token = self.sp_model.IdToPiece(index)
+        else:
+            token = u"<extra_id_{}>".format(self.vocab_size - 1 - index)
+        if six.PY2 and return_unicode and isinstance(token, str):
+            token = token.decode('utf-8')
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = self.sp_model.decode_pieces(tokens)
+        return out_string
+
+    def save_vocabulary(self, save_directory):
+        """ Save the sentencepiece vocabulary (copy original file) to a directory.
+
+        Returns a 1-tuple with the path of the saved file, or None (after
+        logging an error) when `save_directory` is not a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+
+        # Nothing to copy when the model file already lives at the target path.
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -25,7 +25,7 @@ import itertools
 import re
 from io import open

-from .file_utils import cached_path, is_tf_available, is_torch_available
+from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available

 if is_tf_available():
    import tensorflow as tf
@@ -226,7 +226,7 @@ class PreTrainedTokenizer(object):

        self.max_len = max_len if max_len is not None else int(1e12)

-        # Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed.
+        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
        self.padding_side = kwargs.pop('padding_side', self.padding_side)
        
        # Added tokens
@@ -255,6 +255,7 @@ class PreTrainedTokenizer(object):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.

@@ -282,6 +283,9 @@ class PreTrainedTokenizer(object):
            # Download vocabulary from S3 and cache.
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+
            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')

@@ -327,12 +331,15 @@ class PreTrainedTokenizer(object):
                if os.path.isdir(pretrained_model_name_or_path):
                    # If a directory is provided we look for the standard filenames
                    full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
-                else:
+                    if not os.path.exists(full_file_name):
+                        logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                        full_file_name = None
+                elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                    # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                    full_file_name = pretrained_model_name_or_path
-                if not os.path.exists(full_file_name):
-                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
-                    full_file_name = None
+                else:
+                    full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
+                
                vocab_files[file_id] = full_file_name

            # Look for the additional tokens files
@@ -628,12 +635,13 @@ class PreTrainedTokenizer(object):
            Take care of added tokens.

            text: The sequence to be encoded.
-            return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
            **kwargs: passed to the child `self.tokenize()` method
        """
+        all_special_tokens = self.all_special_tokens
+
        def lowercase_text(t):
            # convert non-special tokens to lowercase
-            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
+            escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens]
            pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
                      r'(.+?)'
            return re.sub(
@@ -663,7 +671,7 @@ class PreTrainedTokenizer(object):
            return result

        def split_on_tokens(tok_list, text):
-            if not text:
+            if not text.strip():
                return []
            if not tok_list:
                return self._tokenize(text, **kwargs)
@@ -674,17 +682,17 @@ class PreTrainedTokenizer(object):
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.added_tokens_encoder \
-                            and sub_text not in self.all_special_tokens:
+                            and sub_text not in all_special_tokens:
                        tokenized_text += split_on_token(tok, sub_text)
                    else:
                        tokenized_text += [sub_text]
                text_list = tokenized_text

            return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
-                    in self.added_tokens_encoder and token not in self.all_special_tokens \
+                    in self.added_tokens_encoder and token not in all_special_tokens \
                    else [token] for token in tokenized_text)))

-        added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
+        added_tokens = list(self.added_tokens_encoder.keys()) + all_special_tokens
        tokenized_text = split_on_tokens(added_tokens, text)
        return tokenized_text

@@ -1003,7 +1011,7 @@ class PreTrainedTokenizer(object):
            return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                or PyTorch torch.Tensor instead of a list of python integers.
            return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoir returning attention mask (default True)
+            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
            return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
            return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).

@@ -1048,24 +1056,13 @@ class PreTrainedTokenizer(object):
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
-            special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
+
        if return_special_tokens_mask:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)

-        # Prepare inputs as tensors if asked
-        if return_tensors == 'tf' and is_tf_available():
-            sequence = tf.constant([sequence])
-            token_type_ids = tf.constant([token_type_ids])
-        elif return_tensors == 'pt' and is_torch_available():
-            sequence = torch.tensor([sequence])
-            token_type_ids = torch.tensor([token_type_ids])
-        elif return_tensors is not None:
-            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
-
        encoded_inputs["input_ids"] = sequence
        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
@@ -1089,7 +1086,7 @@ class PreTrainedTokenizer(object):
        )

        if pad_to_max_length and max_length is None and self.max_len > 10000:
-            logger.warning("Sequence can't be padded as the maximum  ")
+            logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")

        if needs_to_be_padded:
            difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])
@@ -1102,10 +1099,9 @@ class PreTrainedTokenizer(object):
                if return_special_tokens_mask:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
-
            elif self.padding_side == 'left':
                if return_attention_mask:
-                    encoded_inputs["attention_mask"] =  [0] * difference + [1] * len(encoded_inputs["input_ids"])
+                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
                if return_token_type_ids:
                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
                if return_special_tokens_mask:
@@ -1117,7 +1113,26 @@ class PreTrainedTokenizer(object):
            
        elif return_attention_mask:
            encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
-            
+
+        # Prepare inputs as tensors if asked
+        if return_tensors == 'tf' and is_tf_available():
+            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
+
+        elif return_tensors == 'pt' and is_torch_available():
+            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
+        elif return_tensors is not None:
+            logger.warning(
+                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
+                    return_tensors))
+
        return encoded_inputs

    def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):
@@ -1251,12 +1266,12 @@ class PreTrainedTokenizer(object):
                if current_sub_text:
                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                    current_sub_text = []
-                sub_texts.append(" " + token)
+                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-        text = ''.join(sub_texts)
+        text = ' '.join(sub_texts)

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)

--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                           additional_special_tokens=additional_special_tokens,
                                           **kwargs)

+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens 
+
        # cache of sm.MosesPunctNormalizer instance
        self.cache_moses_punct_normalizer = dict()
        # cache of sm.MosesTokenizer instance

--- a/utils/link_tester.py
+++ b/utils/link_tester.py
+""" Link tester.
+
+This little utility reads all the python files in the repository,
+scans for links pointing to S3 and tests the links one by one. Raises an error
+at the end of the scan if at least one link was reported broken.
+"""
+import os
+import re
+import sys
+
+import requests
+
+
+# Matches a quoted string literal that starts with "https://s3". Groups: (1) the opening
+# quote, (2) the "https://s3" prefix, (3) the rest of the link up to the closing quote.
+# Group 3 is non-greedy so the match stops at the first closing quote rather than the
+# last one on the line.
+REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*?)\1"""
+
+
+def list_python_files_in_repository():
+    """ List all python files in the repository.
+
+    This function assumes that the script is executed in the root folder.
+    Directories whose path contains "templates" are skipped, and only files
+    whose name ends with ".py" are kept (so ".pyc", ".pyi", ".pylintrc" and
+    backup files such as "x.py.bak" are ignored).
+
+    Returns:
+        A list of file paths, each starting from the current directory (".").
+    """
+    source_code_files = []
+    for path, _, files in os.walk("."):
+        if "templates" in path:
+            continue
+        source_code_files.extend(
+            os.path.join(path, name) for name in files if name.endswith(".py")
+        )
+
+    return source_code_files
+
+
+def find_all_links(file_paths):
+    """ Scan every file in `file_paths` and return all the links found, in file order. """
+    return [link for path in file_paths for link in scan_code_for_links(path)]
+
+
+def scan_code_for_links(source):
+    """ Scans the file to find links using a regular expression.
+
+    Args:
+        source: path of the file to scan.
+
+    Returns:
+        A list of links (strings), in the order they appear in the file.
+    """
+    import io  # local import: `io.open` accepts `encoding` on both Python 2 and Python 3
+
+    # Decode explicitly as UTF-8 so the scan does not depend on the locale's default encoding.
+    with io.open(source, 'r', encoding='utf-8') as source_file:
+        content = source_file.read()
+
+    # Each match is a (quote, prefix, suffix) tuple; the link is prefix + suffix, without quotes.
+    raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
+    return [prefix + suffix for _, prefix, suffix in raw_links]
+
+
+def check_all_links(links, timeout=10):
+    """ Check that the provided links are valid.
+
+    Links are considered valid if a HEAD request to the server
+    returns a 200 status code. A link whose request fails altogether
+    (connection error, timeout, invalid URL, ...) is reported as broken
+    instead of aborting the whole scan.
+
+    Args:
+        links: iterable of URLs to test.
+        timeout: number of seconds to wait for each server before giving up.
+
+    Returns:
+        The list of links that did not answer with a 200 status code,
+        in the order they were provided.
+    """
+    broken_links = []
+    for link in links:
+        try:
+            status_code = requests.head(link, timeout=timeout).status_code
+        except requests.RequestException:
+            # No usable response at all: treat it like any other non-200 answer.
+            status_code = None
+        if status_code != 200:
+            broken_links.append(link)
+
+    return broken_links
+
+
+if __name__ == "__main__":
+    # Announce the scan before it starts: every link found triggers a network request,
+    # so the message should not appear only once all the checks are done.
+    print("Looking for broken links to pre-trained models/configs/tokenizers...")
+    file_paths = list_python_files_in_repository()
+    links = find_all_links(file_paths)
+    broken_links = check_all_links(links)
+    if broken_links:
+        print("The following links did not respond:")
+        for link in broken_links:
+            print("- {}".format(link))
+        # A non-zero exit status signals the failure to the caller (e.g. a CI job).
+        sys.exit(1)
+    print("All links are ok.")