"awq/modules/git@developer.sourcefind.cn:OpenDAS/autoawq.git" did not exist on "9e8e28b27ae6f72238f2de79c140153c0db44874"
Unverified commit d2f21f08, authored by Thomas Wolf, committed by GitHub.

Merge pull request #1092 from shijie-wu/xlm-tokenization

Added cleaned configuration properties for tokenizer with serialization - improve tokenization of XLM
parents 12b9cc9e 7044ed6b
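The core of this PR: a tokenizer's instantiation arguments now round-trip through save_pretrained()/from_pretrained() via a new tokenizer_config.json file. A minimal sketch of the intended behavior (the directory path and the max_len value are illustrative only):

import os
from pytorch_transformers import BertTokenizer

# Init kwargs are recorded on the instance as init_kwargs ...
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_len=42)

# ... written to tokenizer_config.json alongside the vocabulary on save ...
os.makedirs('./saved_tokenizer', exist_ok=True)
tokenizer.save_pretrained('./saved_tokenizer')

# ... and restored when the tokenizer is reloaded from that directory.
reloaded = BertTokenizer.from_pretrained('./saved_tokenizer')
assert reloaded.max_len == 42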
......@@ -44,6 +44,8 @@ XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.json",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.json",
}
XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
......@@ -54,6 +56,8 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
}
......@@ -114,6 +118,7 @@ class XLMConfig(PretrainedConfig):
causal=False,
asm=False,
n_langs=1,
use_lang_emb=True,
max_position_embeddings=512,
embed_init_std=2048 ** -0.5,
layer_norm_eps=1e-12,
......@@ -157,6 +162,7 @@ class XLMConfig(PretrainedConfig):
self.causal = causal
self.asm = asm
self.n_langs = n_langs
self.use_lang_emb = use_lang_emb
self.layer_norm_eps = layer_norm_eps
self.bos_index = bos_index
self.eos_index = eos_index
......@@ -488,7 +494,7 @@ class XLMModel(XLMPreTrainedModel):
"""
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads',
'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads',
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
'asm_cutoffs', 'asm_div_value']
......@@ -507,6 +513,7 @@ class XLMModel(XLMPreTrainedModel):
# dictionary / languages
self.n_langs = config.n_langs
self.use_lang_emb = config.use_lang_emb
self.n_words = config.n_words
self.eos_index = config.eos_index
self.pad_index = config.pad_index
......@@ -529,7 +536,7 @@ class XLMModel(XLMPreTrainedModel):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
if config.sinusoidal_embeddings:
create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
if config.n_langs > 1:
if config.n_langs > 1 and config.use_lang_emb:
self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
......@@ -628,7 +635,7 @@ class XLMModel(XLMPreTrainedModel):
# embeddings
tensor = self.embeddings(input_ids)
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
if langs is not None:
if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None:
tensor = tensor + self.embeddings(token_type_ids)
......
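The new use_lang_emb flag lets a configuration turn off the language-embedding table (the 17- and 100-language checkpoints registered above are the motivating case). A hedged sketch of the effect, using default config values otherwise:

from pytorch_transformers import XLMConfig, XLMModel

# With use_lang_emb=False the model skips creating self.lang_embeddings,
# and any `langs` ids passed to forward() are ignored in the embedding sum.
config = XLMConfig(n_langs=17, use_lang_emb=False)
model = XLMModel(config)
assert not hasattr(model, 'lang_embeddings')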
......@@ -41,8 +41,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self):
return self.tokenizer_class.from_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"UNwant\u00E9d,running"
......
......@@ -27,8 +27,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
tokenizer_class = DistilBertTokenizer
def get_tokenizer(self):
return DistilBertTokenizer.from_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def test_sequence_builders(self):
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
......
......@@ -44,8 +44,9 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
with open(self.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self):
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"lower newer"
......
......@@ -45,8 +45,8 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
with open(self.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self):
return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"lower newer"
......
......@@ -43,8 +43,9 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
with open(self.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self):
return RobertaTokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
def get_tokenizer(self, **kwargs):
kwargs.update(self.special_tokens_map)
return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"lower newer"
......
......@@ -49,23 +49,32 @@ class CommonTestCases:
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def get_tokenizer(self):
def get_tokenizer(self, **kwargs):
raise NotImplementedError
def get_input_output_texts(self):
raise NotImplementedError
def test_save_and_load_tokenizer(self):
# safety check on max_len default value so we are sure the test works
tokenizer = self.get_tokenizer()
self.assertNotEqual(tokenizer.max_len, 42)
# Now let's start the test
tokenizer = self.get_tokenizer(max_len=42)
before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
with TemporaryDirectory() as tmpdirname:
tokenizer.save_pretrained(tmpdirname)
tokenizer = tokenizer.from_pretrained(tmpdirname)
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
self.assertListEqual(before_tokens, after_tokens)
after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
self.assertListEqual(before_tokens, after_tokens)
self.assertEqual(tokenizer.max_len, 42)
tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
self.assertEqual(tokenizer.max_len, 43)
def test_pickle_tokenizer(self):
tokenizer = self.get_tokenizer()
......
......@@ -37,8 +37,9 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self):
return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)
def get_tokenizer(self, **kwargs):
kwargs['lower_case'] = True
return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"<unk> UNwanted , running"
......
......@@ -44,8 +44,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
with open(self.merges_file, "w") as fp:
fp.write("\n".join(merges))
def get_tokenizer(self):
return XLMTokenizer.from_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"lower newer"
......
......@@ -35,8 +35,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname)
def get_tokenizer(self):
return XLNetTokenizer.from_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self):
input_text = u"This is a test"
......
......@@ -63,6 +63,23 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'bert-base-cased-finetuned-mrpc': 512,
}
PRETRAINED_INIT_CONFIGURATION = {
'bert-base-uncased': {'do_lower_case': True},
'bert-large-uncased': {'do_lower_case': True},
'bert-base-cased': {'do_lower_case': False},
'bert-large-cased': {'do_lower_case': False},
'bert-base-multilingual-uncased': {'do_lower_case': True},
'bert-base-multilingual-cased': {'do_lower_case': False},
'bert-base-chinese': {'do_lower_case': False},
'bert-base-german-cased': {'do_lower_case': False},
'bert-large-uncased-whole-word-masking': {'do_lower_case': True},
'bert-large-cased-whole-word-masking': {'do_lower_case': False},
'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
}
def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
......@@ -100,6 +117,7 @@ class BertTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
......@@ -202,24 +220,6 @@ class BertTokenizer(PreTrainedTokenizer):
index += 1
return (vocab_file,)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
""" Instantiate a BertTokenizer from pre-trained vocabulary files.
"""
if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES:
if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
logger.warning("The pre-trained model you are loading is a cased model but you have not set "
"`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
"you may want to check this behavior.")
kwargs['do_lower_case'] = False
elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
logger.warning("The pre-trained model you are loading is an uncased model but you have set "
"`do_lower_case` to False. We are setting `do_lower_case=True` for you "
"but you may want to check this behavior.")
kwargs['do_lower_case'] = True
return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
......
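With the per-checkpoint defaults in PRETRAINED_INIT_CONFIGURATION, the removed from_pretrained override above is no longer needed: the right do_lower_case comes from the map at load time, and an explicit keyword argument still overrides it (init_kwargs.update(kwargs) in _from_pretrained). A small sketch of the resulting behavior, assuming BertTokenizer's usual basic_tokenizer attribute:

from pytorch_transformers import BertTokenizer

# The cased checkpoint picks up do_lower_case=False from PRETRAINED_INIT_CONFIGURATION ...
cased = BertTokenizer.from_pretrained('bert-base-cased')
assert cased.basic_tokenizer.do_lower_case is False

# ... while an explicit keyword argument still wins over the stored default.
forced = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)
assert forced.basic_tokenizer.do_lower_case is True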
......@@ -95,7 +95,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
# in a library like ours, at all.
vocab_dict = torch.load(pretrained_vocab_file)
for key, value in vocab_dict.items():
self.__dict__[key] = value
if key not in self.__dict__:
self.__dict__[key] = value
if vocab_file is not None:
self.build_vocab()
......
......@@ -20,6 +20,7 @@ import logging
import os
import json
import six
import copy
from io import open
from .file_utils import cached_path
......@@ -28,6 +29,7 @@ logger = logging.getLogger(__name__)
SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
ADDED_TOKENS_FILE = 'added_tokens.json'
TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'
class PreTrainedTokenizer(object):
""" Base class for all tokenizers.
......@@ -40,6 +42,7 @@ class PreTrainedTokenizer(object):
- ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
- ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
- ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
- ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionary of specific arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method.
Parameters:
......@@ -61,6 +64,7 @@ class PreTrainedTokenizer(object):
"""
vocab_files_names = {}
pretrained_vocab_files_map = {}
pretrained_init_configuration = {}
max_model_input_sizes = {}
SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
......@@ -166,12 +170,15 @@ class PreTrainedTokenizer(object):
self._additional_special_tokens = []
self.max_len = max_len if max_len is not None else int(1e12)
self.max_len_single_sentence = self.max_len
self.max_len_sentences_pair = self.max_len
# Added tokens
self.added_tokens_encoder = {}
self.added_tokens_decoder = {}
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
self.init_inputs = ()
self.init_kwargs = {}
for key, value in kwargs.items():
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == 'additional_special_tokens':
......@@ -231,17 +238,20 @@ class PreTrainedTokenizer(object):
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
s3_models = list(cls.max_model_input_sizes.keys())
vocab_files = {}
init_configuration = {}
if pretrained_model_name_or_path in s3_models:
# Get the vocabulary from AWS S3 bucket
for file_id, map_list in cls.pretrained_vocab_files_map.items():
vocab_files[file_id] = map_list[pretrained_model_name_or_path]
if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration:
init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path]
else:
# Get the vocabulary from local files
logger.info(
......@@ -264,15 +274,17 @@ class PreTrainedTokenizer(object):
vocab_files[file_id] = full_file_name
# Look for the additional tokens files
all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE,
'tokenizer_config_file': TOKENIZER_CONFIG_FILE,
}
# If a path to a file was provided, get the parent directory
saved_directory = pretrained_model_name_or_path
if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
saved_directory = os.path.dirname(saved_directory)
for file_id, file_name in all_vocab_files_names.items():
for file_id, file_name in additional_files_names.items():
full_file_name = os.path.join(saved_directory, file_name)
if not os.path.exists(full_file_name):
logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
......@@ -315,28 +327,46 @@ class PreTrainedTokenizer(object):
logger.info("loading file {} from cache at {}".format(
file_path, resolved_vocab_files[file_id]))
# Prepare tokenizer initialization kwargs
# Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None)
if tokenizer_config_file is not None:
init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8"))
saved_init_inputs = init_kwargs.pop('init_inputs', ())
if not init_inputs:
init_inputs = saved_init_inputs
else:
init_kwargs = init_configuration
# Update with newly provided kwargs
init_kwargs.update(kwargs)
# Set max length if needed
if pretrained_model_name_or_path in cls.max_model_input_sizes:
# if we're using a pretrained model, ensure the tokenizer
# won't index sequences longer than the number of positional embeddings
max_len = cls.max_model_input_sizes[pretrained_model_name_or_path]
if max_len is not None and isinstance(max_len, (int, float)):
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len)
# Merge resolved_vocab_files arguments in kwargs.
# Merge resolved_vocab_files arguments in init_kwargs.
added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None)
special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None)
for args_name, file_path in resolved_vocab_files.items():
if args_name not in kwargs:
kwargs[args_name] = file_path
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
if special_tokens_map_file is not None:
special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8"))
for key, value in special_tokens_map.items():
if key not in kwargs:
kwargs[key] = value
if key not in init_kwargs:
init_kwargs[key] = value
# Instantiate tokenizer.
tokenizer = cls(*inputs, **kwargs)
tokenizer = cls(*init_inputs, **init_kwargs)
# Save inputs and kwargs for saving and re-loading with ``save_pretrained``
tokenizer.init_inputs = init_inputs
tokenizer.init_kwargs = init_kwargs
# Add supplementary tokens.
if added_tokens_file is not None:
......@@ -349,8 +379,13 @@ class PreTrainedTokenizer(object):
def save_pretrained(self, save_directory):
""" Save the tokenizer vocabulary files (with added tokens) and the
special-tokens-to-class-attributes-mapping to a directory.
""" Save the tokenizer vocabulary files together with:
- added tokens,
- special-tokens-to-class-attributes-mapping,
- tokenizer instantiation positional and keyword inputs (e.g. do_lower_case for Bert).
This won't save modifications you may have applied to the tokenizer after instantiation, other than
added tokens and the special tokens mapping (e.g. modifying tokenizer.do_lower_case after creation is not saved).
This method makes sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
"""
......@@ -360,6 +395,15 @@ class PreTrainedTokenizer(object):
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE)
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE)
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE)
tokenizer_config = copy.deepcopy(self.init_kwargs)
tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs)
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
with open(tokenizer_config_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(tokenizer_config, ensure_ascii=False))
with open(special_tokens_map_file, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.special_tokens_map, ensure_ascii=False))
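For reference, a hedged sketch of what ends up in the new tokenizer_config.json after a save (directory name is hypothetical; exact keys depend on the tokenizer):

import json
import os
from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
os.makedirs('./bert_tok', exist_ok=True)
tokenizer.save_pretrained('./bert_tok')
with open(os.path.join('./bert_tok', 'tokenizer_config.json'), encoding='utf-8') as f:
    tokenizer_config = json.load(f)
# Expect something like {'do_lower_case': False, 'max_len': 512, 'init_inputs': []};
# vocabulary-file paths are popped from the config before it is written (see the loop above).
print(tokenizer_config)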
......@@ -566,7 +610,7 @@ class PreTrainedTokenizer(object):
def _convert_token_to_id(self, token):
raise NotImplementedError
def encode(self, text, text_pair=None, add_special_tokens=False):
def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs):
"""
Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
......@@ -577,15 +621,16 @@ class PreTrainedTokenizer(object):
text_pair: Optional second sequence to be encoded.
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
to their model.
**kwargs: passed to the `self.tokenize()` method
"""
if text_pair is None:
if add_special_tokens:
return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text, **kwargs)))
else:
return self.convert_tokens_to_ids(self.tokenize(text))
return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)]
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)]
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]
if add_special_tokens:
return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
......
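encode() now forwards extra keyword arguments to tokenize(), so per-call options can reach language-aware tokenizers. A sketch only: the lang keyword below is an assumption about the updated XLM tokenizer (whose file changes are collapsed in this view) and is not confirmed by the lines shown here.

from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
# Extra keywords (here the assumed `lang`) pass straight through to tokenize().
ids = tokenizer.encode(u"Hello world!", add_special_tokens=True, lang='en')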
This diff is collapsed.
......@@ -61,7 +61,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, max_len=None,
def __init__(self, vocab_file,
do_lower_case=False, remove_space=True, keep_accents=False,
bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
......
......@@ -9,4 +9,6 @@ requests
# For OpenAI GPT
regex
# For XLNet
sentencepiece
\ No newline at end of file
sentencepiece
# For XLM
sacremoses
\ No newline at end of file
......@@ -55,7 +55,8 @@ setup(
'requests',
'tqdm',
'regex',
'sentencepiece'],
'sentencepiece',
'sacremoses'],
entry_points={
'console_scripts': [
"pytorch_transformers=pytorch_transformers.__main__:main",
......