Unverified Commit 4fc9f9ef authored by Thomas Wolf, committed by GitHub

Merge pull request #910 from huggingface/auto_models

Adding AutoTokenizer and AutoModel classes that automatically detect architecture - Clean up tokenizers
parents 3a126e73 d43dc48b
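
The headline feature is name-based dispatch: the new `AutoTokenizer`, `AutoConfig` and `AutoModel` classes inspect the checkpoint name and instantiate the matching architecture. A minimal usage sketch assembled from the docstring examples in this diff (the `encode` call mirrors the BERT examples below):

```python
import torch
from pytorch_transformers import AutoTokenizer, AutoModel

# 'bert-base-uncased' contains 'bert', so the Auto classes dispatch
# to BertTokenizer and BertModel behind the scenes.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')
model.eval()

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
with torch.no_grad():
    outputs = model(input_ids)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
```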
@@ -17,16 +17,16 @@ xlm_start_docstring = """
 Example:
 # Load the tokenizer
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 # Prepare tokenized input
->>> text_1 = "Who was Jim Henson ?"
+text_1 = "Who was Jim Henson ?"
->>> text_2 = "Jim Henson was a puppeteer"
+text_2 = "Jim Henson was a puppeteer"
->>> indexed_tokens_1 = tokenizer.encode(text_1)
+indexed_tokens_1 = tokenizer.encode(text_1)
->>> indexed_tokens_2 = tokenizer.encode(text_2)
+indexed_tokens_2 = tokenizer.encode(text_2)
->>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
->>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 """
 # A lot of models share the same param doc. Use a decorator
@@ -76,11 +76,11 @@ def xlmTokenizer(*args, **kwargs):
 Default: None
 Example:
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
->>> text = "Who was Jim Henson ?"
+text = "Who was Jim Henson ?"
->>> indexed_tokens = tokenizer.encode(tokenized_text)
+indexed_tokens = tokenizer.encode(text)
 """
 tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
 return tokenizer
@@ -91,11 +91,11 @@ def xlmTokenizer(*args, **kwargs):
 def xlmModel(*args, **kwargs):
 """
 # Load xlmModel
->>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
+model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
->>> model.eval()
+model.eval()
 # Predict hidden states features for each layer
->>> with torch.no_grad():
+with torch.no_grad():
     hidden_states_1, mems = model(tokens_tensor_1)
     hidden_states_2, mems = model(tokens_tensor_2, past=mems)
 """
@@ -108,26 +108,26 @@ def xlmModel(*args, **kwargs):
 def xlmLMHeadModel(*args, **kwargs):
 """
 # Prepare tokenized input
->>> text_1 = "Who was Jim Henson ?"
+text_1 = "Who was Jim Henson ?"
->>> text_2 = "Jim Henson was a puppeteer"
+text_2 = "Jim Henson was a puppeteer"
->>> indexed_tokens_1 = tokenizer.encode(text_1)
+indexed_tokens_1 = tokenizer.encode(text_1)
->>> indexed_tokens_2 = tokenizer.encode(text_2)
+indexed_tokens_2 = tokenizer.encode(text_2)
->>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
->>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 # Load xlmLMHeadModel
->>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmLMHeadModel', 'xlm-mlm-en-2048')
+model = torch.hub.load('huggingface/pytorch-transformers', 'xlmLMHeadModel', 'xlm-mlm-en-2048')
->>> model.eval()
+model.eval()
 # Predict hidden states features for each layer
->>> with torch.no_grad():
+with torch.no_grad():
     predictions_1, mems = model(tokens_tensor_1)
     predictions_2, mems = model(tokens_tensor_2, mems=mems)
 # Get the predicted last token
->>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
->>> predicted_token = tokenizer.decode([predicted_index])
+predicted_token = tokenizer.decode([predicted_index])
->>> assert predicted_token == ' who'
+assert predicted_token == ' who'
 """
 model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
 return model
@@ -142,25 +142,25 @@ def xlmLMHeadModel(*args, **kwargs):
 # Example:
 # # Load the tokenizer
-# >>> import torch
+# import torch
-# >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
 # # Prepare tokenized input
-# >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-# >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-# >>> tokenized_text1 = tokenizer.tokenize(text1)
+# tokenized_text1 = tokenizer.tokenize(text1)
-# >>> tokenized_text2 = tokenizer.tokenize(text2)
+# tokenized_text2 = tokenizer.tokenize(text2)
-# >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-# >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-# >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-# >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 # # Load xlnetForSequenceClassification
-# >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
-# >>> model.eval()
+# model.eval()
 # # Predict sequence classes logits
-# >>> with torch.no_grad():
+# with torch.no_grad():
 #     lm_logits, mems = model(tokens_tensor)
 # """
 # model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
...
@@ -53,11 +53,11 @@ def xlnetTokenizer(*args, **kwargs):
 Default: None
 Example:
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
->>> text = "Who was Jim Henson ?"
+text = "Who was Jim Henson ?"
->>> indexed_tokens = tokenizer.encode(tokenized_text)
+indexed_tokens = tokenizer.encode(text)
 """
 tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
 return tokenizer
@@ -72,23 +72,23 @@ def xlnetModel(*args, **kwargs):
 Example:
 # Load the tokenizer
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 # Prepare tokenized input
->>> text_1 = "Who was Jim Henson ?"
+text_1 = "Who was Jim Henson ?"
->>> text_2 = "Jim Henson was a puppeteer"
+text_2 = "Jim Henson was a puppeteer"
->>> indexed_tokens_1 = tokenizer.encode(text_1)
+indexed_tokens_1 = tokenizer.encode(text_1)
->>> indexed_tokens_2 = tokenizer.encode(text_2)
+indexed_tokens_2 = tokenizer.encode(text_2)
->>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
->>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 # Load xlnetModel
->>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
+model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
->>> model.eval()
+model.eval()
 # Predict hidden states features for each layer
->>> with torch.no_grad():
+with torch.no_grad():
     hidden_states_1, mems = model(tokens_tensor_1)
     hidden_states_2, mems = model(tokens_tensor_2, mems=mems)
 """
@@ -106,30 +106,30 @@ def xlnetLMHeadModel(*args, **kwargs):
 Example:
 # Load the tokenizer
->>> import torch
+import torch
->>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 # Prepare tokenized input
->>> text_1 = "Who was Jim Henson ?"
+text_1 = "Who was Jim Henson ?"
->>> text_2 = "Jim Henson was a puppeteer"
+text_2 = "Jim Henson was a puppeteer"
->>> indexed_tokens_1 = tokenizer.encode(text_1)
+indexed_tokens_1 = tokenizer.encode(text_1)
->>> indexed_tokens_2 = tokenizer.encode(text_2)
+indexed_tokens_2 = tokenizer.encode(text_2)
->>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+tokens_tensor_1 = torch.tensor([indexed_tokens_1])
->>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 # Load xlnetLMHeadModel
->>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
+model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
->>> model.eval()
+model.eval()
 # Predict hidden states features for each layer
->>> with torch.no_grad():
+with torch.no_grad():
     predictions_1, mems = model(tokens_tensor_1)
     predictions_2, mems = model(tokens_tensor_2, mems=mems)
 # Get the predicted last token
->>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
->>> predicted_token = tokenizer.decode([predicted_index])
+predicted_token = tokenizer.decode([predicted_index])
->>> assert predicted_token == ' who'
+assert predicted_token == ' who'
 """
 model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
 return model
@@ -144,25 +144,25 @@ def xlnetLMHeadModel(*args, **kwargs):
 # Example:
 # # Load the tokenizer
-# >>> import torch
+# import torch
-# >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+# tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 # # Prepare tokenized input
-# >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+# text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-# >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+# text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-# >>> tokenized_text1 = tokenizer.tokenize(text1)
+# tokenized_text1 = tokenizer.tokenize(text1)
-# >>> tokenized_text2 = tokenizer.tokenize(text2)
+# tokenized_text2 = tokenizer.tokenize(text2)
-# >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+# indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-# >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+# indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-# >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+# tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-# >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+# mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 # # Load xlnetForSequenceClassification
-# >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+# model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
-# >>> model.eval()
+# model.eval()
 # # Predict sequence classes logits
-# >>> with torch.no_grad():
+# with torch.no_grad():
 #     lm_logits, mems = model(tokens_tensor)
 # """
 # model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
...
__version__ = "1.0.0" __version__ = "1.0.0"
from .tokenization_auto import AutoTokenizer
from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_openai import OpenAIGPTTokenizer
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_gpt2 import GPT2Tokenizer
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
from .tokenization_xlm import XLMTokenizer from .tokenization_xlm import XLMTokenizer
from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization) from .tokenization_utils import (PreTrainedTokenizer)
from .modeling_auto import (AutoConfig, AutoModel)
from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
BertForMaskedLM, BertForNextSentencePrediction, BertForMaskedLM, BertForNextSentencePrediction,
@@ -39,4 +42,4 @@ from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
 from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
                            WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
-from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
+from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_pretrained_bert.modeling import BertModel
+from pytorch_transformers.modeling import BertModel
 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
...
@@ -38,10 +38,13 @@ except ImportError:
 try:
     from pathlib import Path
     PYTORCH_PRETRAINED_BERT_CACHE = Path(
-        os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))
+        os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)))
 except (AttributeError, ImportError):
-    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
-                                              default_cache_path)
+    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE',
+                                              os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
+                                                        default_cache_path))
+PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
@@ -70,7 +73,7 @@ def filename_to_url(filename, cache_dir=None):
     Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
@@ -98,7 +101,7 @@ def cached_path(url_or_filename, cache_dir=None):
     make sure the file exists and then return the path.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
         url_or_filename = str(url_or_filename)
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
@@ -187,7 +190,7 @@ def get_from_cache(url, cache_dir=None):
     If it's not there, download it. Then return the path to the cached file.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
     if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
...
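
The cache logic above now resolves `PYTORCH_TRANSFORMERS_CACHE` first and only falls back to the legacy `PYTORCH_PRETRAINED_BERT_CACHE`. A small sketch of the precedence, with hypothetical paths:

```python
import os

# Hypothetical paths, purely to illustrate the lookup order implemented above.
os.environ['PYTORCH_PRETRAINED_BERT_CACHE'] = '/tmp/legacy_cache'
os.environ['PYTORCH_TRANSFORMERS_CACHE'] = '/tmp/new_cache'

default_cache_path = '/tmp/default_cache'  # stand-in for the library default
cache_dir = os.getenv('PYTORCH_TRANSFORMERS_CACHE',
                      os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))
assert cache_dir == '/tmp/new_cache'  # the new variable wins when both are set
```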
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Auto Model class. """
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn.parameter import Parameter
from .modeling_bert import BertConfig, BertModel
from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
from .modeling_gpt2 import GPT2Config, GPT2Model
from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
from .modeling_xlnet import XLNetConfig, XLNetModel
from .modeling_xlm import XLMConfig, XLMModel
from .modeling_utils import PreTrainedModel, SequenceSummary
logger = logging.getLogger(__name__)
class AutoConfig(object):
r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
that will be instantiated as one of the configuration classes of the library
when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct configuration class instance
by pattern matching on the `pretrained_model_name_or_path` string.
The configuration class to instantiate is selected by the first matching pattern
in the `pretrained_model_name_or_path` string (tried in the following order):
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
This class cannot be instantiated using `__init__()` (doing so raises an error).
"""
def __init__(self):
raise EnvironmentError("AutoConfig is designed to be instantiated "
"using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
r""" Instantiate a one of the configuration classes of the library
from a pre-trained model configuration.
The configuration class to instantiate is selected by the first matching pattern
in the `pretrained_model_name_or_path` string (tried in the following order):
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
Params:
**pretrained_model_name_or_path**: either:
- a string with the `shortcut name` of a pre-trained model configuration to load from cache
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
- a path to a `directory` containing a configuration file saved
using the `save_pretrained(save_directory)` method.
- a path or url to a saved configuration `file`.
**cache_dir**: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
**return_unused_kwargs**: (`optional`) bool:
- If False, then this function returns just the final configuration object.
- If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
**kwargs**: (`optional`) dict:
Dictionary of key/value pairs with which to update the configuration object after loading.
- The values in kwargs of any keys which are configuration attributes will be used
to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
by the `return_unused_kwargs` keyword parameter.
Examples::
config = AutoConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
config = AutoConfig.from_pretrained('./test/bert_saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
assert config.output_attention == True
config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
foo=False, return_unused_kwargs=True)
assert config.output_attention == True
assert unused_kwargs == {'foo': False}
"""
if 'bert' in pretrained_model_name_or_path:
return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm'".format(pretrained_model_name_or_path))
class AutoModel(object):
r"""
:class:`~pytorch_transformers.AutoModel` is a generic model class
that will be instantiated as one of the base model classes of the library
when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
class method.
The `from_pretrained()` method takes care of returning the correct model class instance
by pattern matching on the `pretrained_model_name_or_path` string.
The base model class to instantiate is selected by the first matching pattern
in the `pretrained_model_name_or_path` string (tried in the following order):
- contains `bert`: BertModel (Bert model)
- contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
- contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
- contains `xlnet`: XLNetModel (XLNet model)
- contains `xlm`: XLMModel (XLM model)
This class cannot be instantiated using `__init__()` (doing so raises an error).
"""
def __init__(self):
raise EnvironmentError("AutoModel is designed to be instantiated "
"using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r""" Instantiate a one of the base model classes of the library
from a pre-trained model configuration.
The base model class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order):
- contains `bert`: BertConfig (Bert model)
- contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
- contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
- contains `xlnet`: XLNetConfig (XLNet model)
- contains `xlm`: XLMConfig (XLM model)
The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
To train the model, you should first set it back in training mode with `model.train()`
Params:
**pretrained_model_name_or_path**: either:
- a string with the `shortcut name` of a pre-trained model to load from cache
or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
- a path to a `directory` containing a configuration file saved
using the `save_pretrained(save_directory)` method.
- a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
In this case, ``from_tf`` should be set to True and a configuration object should be
provided as `config` argument. This loading option is slower than converting the TensorFlow
checkpoint into a PyTorch model using the provided conversion scripts and loading
the PyTorch model afterwards.
**model_args**: (`optional`) Sequence:
All remaining positional arguments will be passed to the underlying model's __init__ function
**config**: an optional configuration for the model to use instead of an automatically loaded configuration.
Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
- the model was saved using the `save_pretrained(save_directory)` method (loaded by supplying the save directory).
**state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded
from saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
a simpler option.
**cache_dir**: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
**output_loading_info**: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
**kwargs**: (`optional`) dict:
Dictionary of key/value pairs with which to update the configuration object after loading.
Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
- If a configuration is provided with `config`, **kwargs will be directly passed
to the underlying model's __init__ method.
- If a configuration is not provided, **kwargs will be first passed to the pretrained
model configuration class loading function (`PretrainedConfig.from_pretrained`).
Each key of **kwargs that corresponds to a configuration attribute
will be used to override said attribute with the supplied **kwargs value.
Remaining keys that do not correspond to any configuration attribute will
be passed to the underlying model's __init__ function.
Examples::
model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading
assert model.config.output_attention == True
# Loading from a TF checkpoint file instead of a PyTorch model (slower)
config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
"""
if 'bert' in pretrained_model_name_or_path:
return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'openai-gpt' in pretrained_model_name_or_path:
return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'gpt2' in pretrained_model_name_or_path:
return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'transfo-xl' in pretrained_model_name_or_path:
return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlnet' in pretrained_model_name_or_path:
return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
elif 'xlm' in pretrained_model_name_or_path:
return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
raise ValueError("Unrecognized model identifier in {}. Should contains one of "
"'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
"'xlm'".format(pretrained_model_name_or_path))
@@ -643,12 +643,12 @@ class BertModel(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>> model = BertModel(config)
+model = BertModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 """
 def __init__(self, config):
@@ -754,13 +754,13 @@ class BertForPreTraining(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForPreTraining(config)
+model = BertForPreTraining(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> prediction_scores, seq_relationship_scores = outputs[:2]
+prediction_scores, seq_relationship_scores = outputs[:2]
 """
 def __init__(self, config):
@@ -824,13 +824,13 @@ class BertForMaskedLM(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForMaskedLM(config)
+model = BertForMaskedLM(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, masked_lm_labels=input_ids)
+outputs = model(input_ids, masked_lm_labels=input_ids)
->>> loss, prediction_scores = outputs[:2]
+loss, prediction_scores = outputs[:2]
 """
 def __init__(self, config):
@@ -891,13 +891,13 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForNextSentencePrediction(config)
+model = BertForNextSentencePrediction(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> seq_relationship_scores = outputs[0]
+seq_relationship_scores = outputs[0]
 """
 def __init__(self, config):
@@ -951,14 +951,14 @@ class BertForSequenceClassification(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForSequenceClassification(config)
+model = BertForSequenceClassification(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=labels)
+outputs = model(input_ids, labels=labels)
->>> loss, logits = outputs[:2]
+loss, logits = outputs[:2]
 """
 def __init__(self, config):
@@ -1057,15 +1057,15 @@ class BertForMultipleChoice(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForMultipleChoice(config)
+model = BertForMultipleChoice(config)
->>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
->>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
->>> labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=labels)
+outputs = model(input_ids, labels=labels)
->>> loss, classification_scores = outputs[:2]
+loss, classification_scores = outputs[:2]
 """
 def __init__(self, config):
@@ -1127,14 +1127,14 @@ class BertForTokenClassification(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForTokenClassification(config)
+model = BertForTokenClassification(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=labels)
+outputs = model(input_ids, labels=labels)
->>> loss, scores = outputs[:2]
+loss, scores = outputs[:2]
 """
 def __init__(self, config):
@@ -1203,15 +1203,15 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 Examples::
->>> config = BertConfig.from_pretrained('bert-base-uncased')
+config = BertConfig.from_pretrained('bert-base-uncased')
->>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
->>>
+
->>> model = BertForQuestionAnswering(config)
+model = BertForQuestionAnswering(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> start_positions = torch.tensor([1])
+start_positions = torch.tensor([1])
->>> end_positions = torch.tensor([3])
+end_positions = torch.tensor([3])
->>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
->>> loss, start_scores, end_scores = outputs[:2]
+loss, start_scores, end_scores = outputs[:3]
 """
 def __init__(self, config):
...
@@ -137,7 +137,7 @@ class GPT2Config(PretrainedConfig):
 initializer_range=0.02,
 num_labels=1,
-summary_type='token_ids',
+summary_type='cls_index',
 summary_use_proj=True,
 summary_activation=None,
 summary_proj_to_labels=True,
@@ -433,12 +433,12 @@ class GPT2Model(GPT2PreTrainedModel):
 Examples::
->>> config = GPT2Config.from_pretrained('gpt2')
+config = GPT2Config.from_pretrained('gpt2')
->>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
->>> model = GPT2Model(config)
+model = GPT2Model(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 """
 def __init__(self, config):
@@ -567,12 +567,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 Examples::
->>> config = GPT2Config.from_pretrained('gpt2')
+config = GPT2Config.from_pretrained('gpt2')
->>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
->>> model = GPT2LMHeadModel(config)
+model = GPT2LMHeadModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=input_ids)
+outputs = model(input_ids, labels=input_ids)
->>> loss, logits = outputs[:2]
+loss, logits = outputs[:2]
 """
 def __init__(self, config):
@@ -683,14 +683,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 Examples::
->>> config = GPT2Config.from_pretrained('gpt2')
+config = GPT2Config.from_pretrained('gpt2')
->>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
->>> model = GPT2DoubleHeadsModel(config)
+model = GPT2DoubleHeadsModel(config)
->>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
->>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
->>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, mc_token_ids)
+outputs = model(input_ids, mc_token_ids)
->>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+lm_prediction_scores, mc_prediction_scores = outputs[:2]
 """
 def __init__(self, config):
...
@@ -171,7 +171,7 @@ class OpenAIGPTConfig(PretrainedConfig):
 predict_special_tokens=True,
 num_labels=1,
-summary_type='token_ids',
+summary_type='cls_index',
 summary_use_proj=True,
 summary_activation=None,
 summary_proj_to_labels=True,
@@ -439,12 +439,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 Examples::
->>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+config = OpenAIGPTConfig.from_pretrained('openai-gpt')
->>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
->>> model = OpenAIGPTModel(config)
+model = OpenAIGPTModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 """
 def __init__(self, config):
@@ -558,12 +558,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 Examples::
->>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+config = OpenAIGPTConfig.from_pretrained('openai-gpt')
->>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
->>> model = OpenAIGPTLMHeadModel(config)
+model = OpenAIGPTLMHeadModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, labels=input_ids)
+outputs = model(input_ids, labels=input_ids)
->>> loss, logits = outputs[:2]
+loss, logits = outputs[:2]
 """
 def __init__(self, config):
@@ -665,14 +665,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 Examples::
->>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+config = OpenAIGPTConfig.from_pretrained('openai-gpt')
->>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
->>> model = OpenAIGPTDoubleHeadsModel(config)
+model = OpenAIGPTDoubleHeadsModel(config)
->>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
->>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
->>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids, mc_token_ids)
+outputs = model(input_ids, mc_token_ids)
->>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+lm_prediction_scores, mc_prediction_scores = outputs[:2]
 """
 def __init__(self, config):
...
@@ -968,12 +968,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 Examples::
->>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
->>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
->>> model = TransfoXLModel(config)
+model = TransfoXLModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> last_hidden_states, mems = outputs[:2]
+last_hidden_states, mems = outputs[:2]
 """
 def __init__(self, config):
@@ -1284,12 +1284,12 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 Examples::
->>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
->>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
->>> model = TransfoXLLMHeadModel(config)
+model = TransfoXLLMHeadModel(config)
->>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
->>> outputs = model(input_ids)
+outputs = model(input_ids)
->>> prediction_scores, mems = outputs[:2]
+prediction_scores, mems = outputs[:2]
 """
 def __init__(self, config):
...
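
The Transformer-XL examples return `mems` next to the scores; feeding them back into the next forward pass is what lets the model attend across segments. A minimal sketch built from the docstring examples above:

```python
import torch
from pytorch_transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
model.eval()

tokens_tensor_1 = torch.tensor([tokenizer.encode("Who was Jim Henson ?")])
tokens_tensor_2 = torch.tensor([tokenizer.encode("Jim Henson was a puppeteer")])

with torch.no_grad():
    prediction_scores_1, mems = model(tokens_tensor_1)[:2]            # first segment, no memories yet
    prediction_scores_2, mems = model(tokens_tensor_2, mems=mems)[:2]  # reuse memories from segment 1
```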
@@ -68,8 +68,18 @@ else:
 class PretrainedConfig(object):
-""" Base class for all configuration classes.
+r""" Base class for all configuration classes.
-    Handle a few common parameters and methods for loading/downloading/saving configurations.
+    Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
+
+    Class attributes (overridden by derived classes):
+        - ``pretrained_config_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
+
+    Parameters:
+        ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
+        ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens).
+        ``output_attentions``: boolean, default `False`. Should the model return attention weights.
+        ``output_hidden_states``: boolean, default `False`. Should the model return all hidden states.
+        ``torchscript``: boolean, default `False`. Whether the model will be used with TorchScript.
 """
 pretrained_config_archive_map = {}
@@ -81,8 +91,8 @@ class PretrainedConfig(object):
 self.torchscript = kwargs.pop('torchscript', False)
 def save_pretrained(self, save_directory):
-""" Save a configuration object to a directory, so that it
+""" Save a configuration object to the directory `save_directory`, so that it
-can be re-loaded using the `from_pretrained(save_directory)` class method.
+can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
 """
 assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -93,41 +103,42 @@ class PretrainedConfig(object):
 @classmethod
 def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
+r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
-Params:
+Parameters:
-    **pretrained_model_name_or_path**: either:
+    pretrained_model_name_or_path: either:
-        - a string with the `shortcut name` of a pre-trained model configuration to load from cache
-          or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-        - a path to a `directory` containing a configuration file saved
-          using the `save_pretrained(save_directory)` method.
-        - a path or url to a saved configuration `file`.
+        - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+        - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+        - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
-    **cache_dir**: (`optional`) string:
+    cache_dir: (`optional`) string:
         Path to a directory in which a downloaded pre-trained model
         configuration should be cached if the standard cache should not be used.
+    kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+        - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+        - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
-    **return_unused_kwargs**: (`optional`) bool:
+    return_unused_kwargs: (`optional`) bool:
         - If False, then this function returns just the final configuration object.
-        - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
-          is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
-          ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
+        - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes, i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
-    **kwargs**: (`optional`) dict:
-        Dictionary of key/value pairs with which to update the configuration object after loading.
-        - The values in kwargs of any keys which are configuration attributes will be used
-          to override the loaded values.
-        - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-          by the `return_unused_kwargs` keyword parameter.
 Examples::
-    >>> config = BertConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
-    >>> config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-    >>> config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
-    >>> config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
-    >>> assert config.output_attention == True
-    >>> config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
-    >>>                                                    foo=False, return_unused_kwargs=True)
-    >>> assert config.output_attention == True
-    >>> assert unused_kwargs == {'foo': False}
+    # We can't instantiate the base class `PretrainedConfig` directly, so the examples use a
+    # derived class: BertConfig
+    config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+    config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+    config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+    config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+    assert config.output_attention == True
+    config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
+                                                       foo=False, return_unused_kwargs=True)
+    assert config.output_attention == True
+    assert unused_kwargs == {'foo': False}
 """
 cache_dir = kwargs.pop('cache_dir', None)
class PreTrainedModel(nn.Module):
    r""" Base class for all models.

        :class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention modules.

        Class attributes (overridden by derived classes):
            - ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
            - ``pretrained_model_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
            - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:

                - ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
                - ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
                - ``path``: a path (string) to the TensorFlow checkpoint.

            - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
    """
    config_class = None
    pretrained_model_archive_map = {}
    load_tf_weights = lambda model, config, path: None
    base_model_prefix = ""
    input_embeddings = None
    def __init__(self, config, *inputs, **kwargs):
        super(PreTrainedModel, self).__init__()
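For reference, each architecture wires these class attributes up in a small base class of its own. A rough sketch following the Bert implementation in this repository (the archive map is the same one imported by the tests further down; `load_tf_weights_in_bert` is the TF loader defined in `modeling_bert`):

    from pytorch_transformers.modeling_bert import (BertConfig, load_tf_weights_in_bert,
                                                    BERT_PRETRAINED_MODEL_ARCHIVE_MAP)

    class BertPreTrainedModel(PreTrainedModel):
        # Point the generic machinery at the Bert-specific pieces.
        config_class = BertConfig
        pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
        load_tf_weights = load_tf_weights_in_bert
        base_model_prefix = "bert"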
@@ -282,17 +305,16 @@ class PreTrainedModel(nn.Module):
    def resize_token_embeddings(self, new_num_tokens=None):
        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
            Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.

            Arguments:

                new_num_tokens: (`optional`) int:
                    New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
                    If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embedding`` Module of the model.

            Return: ``torch.nn.Embedding``
                Pointer to the input tokens Embedding Module of the model
        """
        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
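A typical call pattern, sketched with the Bert names used elsewhere in this diff (the added token strings are illustrative):

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    num_added = tokenizer.add_tokens(['new_tok1', 'new_tok2'])  # hypothetical new tokens
    model.resize_token_embeddings(len(tokenizer))  # grow the embedding matrix to cover the added tokens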
@@ -311,15 +333,17 @@ class PreTrainedModel(nn.Module):
    def prune_heads(self, heads_to_prune):
        """ Prunes heads of the base model.

            Arguments:

                heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
        """
        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
        base_model._prune_heads(heads_to_prune)
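For example, a minimal sketch (the layer and head indices are illustrative):

    model = BertModel.from_pretrained('bert-base-uncased')
    model.prune_heads({0: [0, 2], 2: [1]})  # prune heads 0 and 2 in layer 0, head 1 in layer 2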
    def save_pretrained(self, save_directory):
        """ Save a model and its configuration file to a directory, so that it
            can be re-loaded using the :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` class method.
        """
        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
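The round trip is then simply (the directory name is illustrative):

    model.save_pretrained('./my_model_directory/')  # writes the weights and config.json
    model = BertModel.from_pretrained('./my_model_directory/')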
@@ -338,58 +362,53 @@ class PreTrainedModel(nn.Module):
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.

        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated).
        To train the model, you should first set it back in training mode with ``model.train()``.

        Parameters:

            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.

            model_args: (`optional`) Sequence of positional arguments:
                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.

            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuration. The configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the `shortcut name` string of a pretrained model), or
                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory, or
                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.

            state_dict: (`optional`) dict:
                An optional state dictionary for the model to use instead of a state dictionary loaded from a saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.

            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.

            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.

            kwargs: (`optional`) Remaining dictionary of keyword arguments:
                Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). These behave differently depending on whether a ``config`` is provided or automatically loaded:

                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done).
                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::

            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
            assert model.config.output_attention == True
            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
        config = kwargs.pop('config', None)
@@ -760,7 +779,7 @@ class SequenceSummary(nn.Module):
            - 'last' => [default] take the last token hidden state (like XLNet)
            - 'first' => take the first token hidden state (like Bert)
            - 'mean' => take the mean of all tokens hidden states
            - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
            - 'attn' => Not implemented now, use multi-head attention

        summary_use_proj: Add a projection after the vector extraction
        summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
@@ -772,7 +791,7 @@ class SequenceSummary(nn.Module):
        super(SequenceSummary, self).__init__()

        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
@@ -798,11 +817,11 @@ class SequenceSummary(nn.Module):
        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
            self.last_dropout = nn.Dropout(config.summary_last_dropout)

    def forward(self, hidden_states, cls_index=None):
        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:
                    we take the last token of the sequence as classification token
        """
        if self.summary_type == 'last':
@@ -811,14 +830,14 @@ class SequenceSummary(nn.Module):
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
            output = hidden_states.mean(dim=1)
        elif self.summary_type == 'cls_index':
            if cls_index is None:
                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
            else:
                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
                cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
        elif self.summary_type == 'attn':
            raise NotImplementedError
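To see what the 'cls_index' branch computes, here is a standalone sketch of the same gather trick with toy shapes:

    import torch

    hidden_states = torch.randn(2, 5, 4)        # (bsz=2, seq_len=5, hidden_size=4)
    cls_index = torch.tensor([3, 1])            # position of the classification token per example
    idx = cls_index.unsqueeze(-1).unsqueeze(-1)         # (2, 1, 1)
    idx = idx.expand(-1, 1, hidden_states.size(-1))     # (2, 1, 4)
    output = hidden_states.gather(-2, idx).squeeze(-2)  # (2, 4): one hidden vector per example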
@@ -472,12 +472,12 @@ class XLMModel(XLMPreTrainedModel):
    Examples::

        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

        model = XLMModel(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output',
@@ -745,12 +745,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    Examples::

        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

        model = XLMWithLMHeadModel(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config):
@@ -805,14 +805,14 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
    Examples::

        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

        model = XLMForSequenceClassification(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
@@ -885,15 +885,15 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
    Examples::

        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

        model = XLMForQuestionAnswering(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss, start_scores, end_scores = outputs[:3]

    """
    def __init__(self, config):
@@ -712,12 +712,12 @@ class XLNetModel(XLNetPreTrainedModel):
    Examples::

        config = XLNetConfig.from_pretrained('xlnet-large-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

        model = XLNetModel(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
    def __init__(self, config):
@@ -1019,17 +1019,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
    Examples::

        config = XLNetConfig.from_pretrained('xlnet-large-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

        model = XLNetLMHeadModel(config)
        # We show how to setup inputs to predict a next token using a bi-directional context.
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]

    """
    def __init__(self, config):
@@ -1100,14 +1100,14 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
    Examples::

        config = XLNetConfig.from_pretrained('xlnet-large-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

        model = XLNetForSequenceClassification(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """
    def __init__(self, config):
@@ -1200,15 +1200,15 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
    Examples::

        config = XLNetConfig.from_pretrained('xlnet-large-cased')
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

        model = XLNetForQuestionAnswering(config)
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss, start_scores, end_scores = outputs[:3]

    """
    def __init__(self, config):
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
import logging
from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
class AutoModelTest(unittest.TestCase):
def test_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
config = AutoConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, BertConfig)
model = AutoModel.from_pretrained(model_name)
model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, BertModel)
for value in loading_info.values():
self.assertEqual(len(value), 0)
if __name__ == "__main__":
unittest.main()
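Outside the test harness, the Auto classes added in this PR are used directly; a minimal sketch mirroring the assertions above:

    from pytorch_transformers import AutoConfig, AutoModel

    # The architecture (here Bert) is detected from the shortcut name.
    config = AutoConfig.from_pretrained('bert-base-uncased')
    model = AutoModel.from_pretrained('bert-base-uncased')

    # As in the test, output_loading_info=True also returns a loading report;
    # every value is an empty list when the checkpoint matches the architecture.
    model, loading_info = AutoModel.from_pretrained('bert-base-uncased', output_loading_info=True)
    assert all(len(v) == 0 for v in loading_info.values())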
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
import logging
from pytorch_transformers import AutoTokenizer, BertTokenizer, GPT2Tokenizer
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
class AutoTokenizerTest(unittest.TestCase):
def test_tokenizer_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
self.assertIsNotNone(tokenizer)
self.assertIsInstance(tokenizer, BertTokenizer)
self.assertGreater(len(tokenizer), 0)
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
self.assertIsNotNone(tokenizer)
self.assertIsInstance(tokenizer, GPT2Tokenizer)
self.assertGreater(len(tokenizer), 0)
if __name__ == "__main__":
unittest.main()
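The tokenizer side works the same way; a minimal sketch of what the test above exercises:

    from pytorch_transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # returns a BertTokenizer
    ids = tokenizer.encode("Who was Jim Henson ?")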
@@ -24,30 +24,37 @@ from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                     _is_control, _is_punctuation,
                                                     _is_whitespace, VOCAB_FILES_NAMES)
from .tokenization_tests_commons import CommonTestCases


class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = BertTokenizer

    def setUp(self):
        super(BertTokenizationTest, self).setUp()

        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
            "##ing", ",", "low", "lowest",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self):
        return BertTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"UNwant\u00E9d,running"
        output_text = u"unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = BertTokenizer(self.vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
    def test_chinese(self):
        tokenizer = BasicTokenizer()
@@ -20,42 +20,49 @@ import json
from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases


class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = GPT2Tokenizer

    def setUp(self):
        super(GPT2TokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "lo", "low", "er",
                 "low", "lowest", "newer", "wider", "<unk>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
        self.special_tokens_map = {"unk_token": "<unk>"}

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower<unk>newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
        text = "lower"
        bpe_tokens = ["low", "er"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + [tokenizer.unk_token]
        input_bpe_tokens = [13, 12, 17]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
if __name__ == '__main__':
@@ -20,13 +20,17 @@ import json
from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases


class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = OpenAIGPTTokenizer

    def setUp(self):
        super(OpenAIGPTTokenizationTest, self).setUp()

        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                 "w</w>", "r</w>", "t</w>",
                 "lo", "low", "er</w>",
@@ -34,30 +38,34 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]

        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
        with open(self.vocab_file, "w") as fp:
            fp.write(json.dumps(vocab_tokens))
        with open(self.merges_file, "w") as fp:
            fp.write("\n".join(merges))

    def get_tokenizer(self):
        return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)

    def get_input_output_texts(self):
        input_text = u"lower newer"
        output_text = u"lower newer"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)

        text = "lower"
        bpe_tokens = ["low", "er</w>"]
        tokens = tokenizer.tokenize(text)
        self.assertListEqual(tokens, bpe_tokens)

        input_tokens = tokens + ["<unk>"]
        input_bpe_tokens = [14, 15, 20]
        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
if __name__ == '__main__':
@@ -19,6 +19,7 @@ import sys
from io import open
import tempfile
import shutil
import unittest
if sys.version_info[0] == 2:
    import cPickle as pickle
@@ -36,113 +37,124 @@ else:
    unicode = str
class CommonTestCases:

    class CommonTokenizerTester(unittest.TestCase):

        tokenizer_class = None

        def setUp(self):
            self.tmpdirname = tempfile.mkdtemp()

        def tearDown(self):
            shutil.rmtree(self.tmpdirname)

        def get_tokenizer(self):
            raise NotImplementedError

        def get_input_output_texts(self):
            raise NotImplementedError

        def test_save_and_load_tokenizer(self):
            tokenizer = self.get_tokenizer()

            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")

            with TemporaryDirectory() as tmpdirname:
                tokenizer.save_pretrained(tmpdirname)
                tokenizer = tokenizer.from_pretrained(tmpdirname)

            after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
            self.assertListEqual(before_tokens, after_tokens)

        def test_pickle_tokenizer(self):
            tokenizer = self.get_tokenizer()
            self.assertIsNotNone(tokenizer)

            text = u"Munich and Berlin are nice cities"
            subwords = tokenizer.tokenize(text)

            with TemporaryDirectory() as tmpdirname:
                filename = os.path.join(tmpdirname, u"tokenizer.bin")
                pickle.dump(tokenizer, open(filename, "wb"))
                tokenizer_new = pickle.load(open(filename, "rb"))

            subwords_loaded = tokenizer_new.tokenize(text)

            self.assertListEqual(subwords, subwords_loaded)

        def test_add_tokens_tokenizer(self):
            tokenizer = self.get_tokenizer()

            vocab_size = tokenizer.vocab_size
            all_size = len(tokenizer)

            self.assertNotEqual(vocab_size, 0)
            self.assertEqual(vocab_size, all_size)

            new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
            added_toks = tokenizer.add_tokens(new_toks)
            vocab_size_2 = tokenizer.vocab_size
            all_size_2 = len(tokenizer)

            self.assertNotEqual(vocab_size_2, 0)
            self.assertEqual(vocab_size, vocab_size_2)
            self.assertEqual(added_toks, len(new_toks))
            self.assertEqual(all_size_2, all_size + len(new_toks))

            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
            self.assertGreaterEqual(len(tokens), 4)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

            new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
                          'pad_token': "<<<<<|||>|>>>>|>"}
            added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
            vocab_size_3 = tokenizer.vocab_size
            all_size_3 = len(tokenizer)

            self.assertNotEqual(vocab_size_3, 0)
            self.assertEqual(vocab_size, vocab_size_3)
            self.assertEqual(added_toks_2, len(new_toks_2))
            self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")

            self.assertGreaterEqual(len(tokens), 6)
            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[0], tokens[1])
            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
            self.assertGreater(tokens[-2], tokens[-3])
            self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
            self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))

        def test_required_methods_tokenizer(self):
            tokenizer = self.get_tokenizer()
            input_text, output_text = self.get_input_output_texts()

            tokens = tokenizer.tokenize(input_text)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            ids_2 = tokenizer.encode(input_text)
            self.assertListEqual(ids, ids_2)

            tokens_2 = tokenizer.convert_ids_to_tokens(ids)
            text_2 = tokenizer.decode(ids)

            self.assertEqual(text_2, output_text)

            self.assertNotEqual(len(tokens_2), 0)
            self.assertIsInstance(text_2, (str, unicode))

        def test_pretrained_model_lists(self):
            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
            weights_lists_2 = []
            for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
                weights_lists_2.append(list(map_list.keys()))

            for weights_list_2 in weights_lists_2:
                self.assertListEqual(weights_list, weights_list_2)
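New tokenizer tests plug into this harness by overriding the two hooks; a minimal sketch with hypothetical names:

    class MyTokenizationTest(CommonTestCases.CommonTokenizerTester):

        tokenizer_class = MyTokenizer  # hypothetical tokenizer under test

        def setUp(self):
            super(MyTokenizationTest, self).setUp()
            # write the vocab/merges fixture files into self.tmpdirname here

        def get_tokenizer(self):
            return MyTokenizer.from_pretrained(self.tmpdirname)

        def get_input_output_texts(self):
            return u"lower newer", u"lower newer"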
@@ -20,32 +20,39 @@ from io import open
from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES

from .tokenization_tests_commons import CommonTestCases


class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):

    tokenizer_class = TransfoXLTokenizer

    def setUp(self):
        super(TransfoXLTokenizationTest, self).setUp()

        vocab_tokens = [
            "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
            "running", ",", "low", "l",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def get_tokenizer(self):
        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)

    def get_input_output_texts(self):
        input_text = u"<unk> UNwanted , running"
        output_text = u"<unk> unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)

        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
    def test_full_tokenizer_lower(self):
        tokenizer = TransfoXLTokenizer(lower_case=True)