Merge pull request #1110 from huggingface/automodels

Torch.hub now based on AutoModels - Updating AutoModels with AutoModelWithLMHead, Sequence Classification and Question Answering

Merge pull request #1110 from huggingface/automodels
Torch.hub now based on AutoModels - Updating AutoModels with AutoModelWithLMHead, Sequence Classification and Question Answering
12b9cc9e · Thomas Wolf · GitHub · 282c276e · bfe93a5a · 12b9cc9e
Unverified Commit 12b9cc9e authored Aug 30, 2019 by Thomas Wolf Committed by GitHub Aug 30, 2019
12 changed files
--- a/hubconf.py
+++ b/hubconf.py
-dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']
-
-from hubconfs.bert_hubconf import (
-    bertTokenizer,
-    bertModel,
-    bertForNextSentencePrediction,
-    bertForPreTraining,
-    bertForMaskedLM,
-    bertForSequenceClassification,
-    bertForMultipleChoice,
-    bertForQuestionAnswering,
-    bertForTokenClassification
-)
-from hubconfs.gpt_hubconf import (
-    openAIGPTTokenizer,
-    openAIGPTModel,
-    openAIGPTLMHeadModel,
-    openAIGPTDoubleHeadsModel
-)
-from hubconfs.gpt2_hubconf import (
-    gpt2Tokenizer,
-    gpt2Model,
-    gpt2LMHeadModel,
-    gpt2DoubleHeadsModel
-)
-from hubconfs.transformer_xl_hubconf import (
-    transformerXLTokenizer,
-    transformerXLModel,
-    transformerXLLMHeadModel
+from pytorch_transformers import (
+    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
 )
+from pytorch_transformers.modeling_utils import add_start_docstrings
+
+dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
+
+@add_start_docstrings(AutoConfig.__doc__)
+def config(*args, **kwargs):
+    r""" 
+                # Using torch.hub !
+                import torch
+
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False)
+                assert config.output_attention == True
+                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attention=True, foo=False, return_unused_kwargs=True)
+                assert config.output_attention == True
+                assert unused_kwargs == {'foo': False}
+
+            """
+
+    return AutoConfig.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoTokenizer.__doc__)
+def tokenizer(*args, **kwargs):
+    r""" 
+        # Using torch.hub !
+        import torch
+
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+
+    """
+
+    return AutoTokenizer.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoModel.__doc__)
+def model(*args, **kwargs):
+    r"""
+            # Using torch.hub !
+            import torch
+
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+
+    return AutoModel.from_pretrained(*args, **kwargs)
+
+@add_start_docstrings(AutoModelWithLMHead.__doc__)
+def modelWithLMHead(*args, **kwargs):
+    r"""
+        # Using torch.hub !
+        import torch
+
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        assert model.config.output_attention == True
+        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+    """
+    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
+def modelForSequenceClassification(*args, **kwargs):
+    r"""
+            # Using torch.hub !
+            import torch
+
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+
+    return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
+
+
+@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
+def modelForQuestionAnswering(*args, **kwargs):
+    r"""
+        # Using torch.hub !
+        import torch
+
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attention=True)  # Update configuration during loading
+        assert model.config.output_attention == True
+        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+    """
+    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
-from pytorch_transformers.tokenization_bert import BertTokenizer
-from pytorch_transformers.modeling_bert import (
-        BertModel,
-        BertForNextSentencePrediction,
-        BertForMaskedLM,
-        BertForMultipleChoice,
-        BertForPreTraining,
-        BertForQuestionAnswering,
-        BertForSequenceClassification,
-        BertForTokenClassification,
-        )
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-bert_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load
-                . `bert-base-uncased`
-                . `bert-large-uncased`
-                . `bert-base-cased`
-                . `bert-large-cased`
-                . `bert-base-multilingual-uncased`
-                . `bert-base-multilingual-cased`
-                . `bert-base-chinese`
-                . `bert-base-german-cased`
-                . `bert-large-uncased-whole-word-masking`
-                . `bert-large-cased-whole-word-masking`
-            - a path or url to a pretrained model archive containing:
-                . `bert_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
-                  instance
-            - a path or url to a pretrained model archive containing:
-                . `bert_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow
-                 checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models
-                   will be cached.
-        state_dict: an optional state dictionary
-                    (collections.OrderedDict object) to use instead of Google
-                    pre-trained models
-        *inputs, **kwargs: additional input for the specific Bert class
-            (ex: num_labels for BertForSequenceClassification)
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def bertTokenizer(*args, **kwargs):
-    """
-    Instantiate a BertTokenizer from a pre-trained/customized vocab file
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * bert-base-uncased
-                                       * bert-large-uncased
-                                       * bert-base-cased
-                                       * bert-large-cased
-                                       * bert-base-multilingual-uncased
-                                       * bert-base-multilingual-cased
-                                       * bert-base-chinese
-    Keyword args:
-    cache_dir: an optional path to a specific directory to download and cache
-               the pre-trained model weights.
-               Default: None
-    do_lower_case: Whether to lower case the input.
-                   Only has an effect when do_wordpiece_only=False
-                   Default: True
-    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-                       Default: True
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying BERT model's
-             sequence length.
-             Default: None
-    never_split: List of tokens which will never be split during tokenization.
-                 Only has an effect when do_wordpiece_only=False
-                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
-
-    Example:
-        import torch
-        sentence = 'Hello, World!'
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        toks = tokenizer.tokenize(sentence)
-        ['Hello', '##,', 'World', '##!']
-        ids = tokenizer.convert_tokens_to_ids(toks)
-        [8667, 28136, 1291, 28125]
-    """
-    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertModel(*args, **kwargs):
-    """
-    BertModel is the basic BERT Transformer model with a layer of summed token,
-    position and sequence embeddings followed by a series of identical
-    self-attention blocks (12 for BERT-base, 24 for BERT-large).
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                encoded_layers, _ = model(tokens_tensor, segments_tensors)
-    """
-    model = BertModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForNextSentencePrediction(*args, **kwargs):
-    """
-    BERT model with next sentence prediction head.
-    This module comprises the BERT model followed by the next sentence
-    classification head.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForNextSentencePrediction
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
-        model.eval()
-        # Predict the next sentence classification logits
-        with torch.no_grad():
-                next_sent_classif_logits = model(tokens_tensor, segments_tensors)
-    """
-    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForPreTraining(*args, **kwargs):
-    """
-    BERT model with pre-training heads.
-    This module comprises the BERT model followed by the two pre-training heads
-        - the masked language modeling head, and
-        - the next sentence classification head.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForPreTraining
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
-        masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
-    """
-    model = BertForPreTraining.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForMaskedLM(*args, **kwargs):
-    """
-    BertForMaskedLM includes the BertModel Transformer followed by the
-    (possibly) pre-trained masked language modeling head.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        masked_index = 8
-        tokenized_text[masked_index] = '[MASK]'
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForMaskedLM
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
-        model.eval()
-        # Predict all tokens
-        with torch.no_grad():
-                predictions = model(tokens_tensor, segments_tensors)
-        predicted_index = torch.argmax(predictions[0, masked_index]).item()
-        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        'henson'
-    """
-    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForSequenceClassification(*args, **kwargs):
-    """
-    BertForSequenceClassification is a fine-tuning model that includes
-    BertModel and a sequence-level (sequence or pair of sequences) classifier
-    on top of the BertModel. Note that the classification head is only initialized
-    and has to be trained.
-
-    The sequence-level classifier is a linear layer that takes as input the
-    last hidden state of the first character in the input sequence
-    (see Figures 3a and 3b in the BERT paper).
-
-    Args:
-    num_labels: the number (>=2) of classes for the classifier.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForSequenceClassification
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
-        model.eval()
-        # Predict the sequence classification logits
-        with torch.no_grad():
-                seq_classif_logits = model(tokens_tensor, segments_tensors)
-        # Or get the sequence classification loss
-        labels = torch.tensor([1])
-        seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForMultipleChoice(*args, **kwargs):
-    """
-    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
-    linear layer on top of the BertModel. Note that the multiple choice head is
-    only initialized and has to be trained.
-
-    Args:
-    num_choices: the number (>=2) of classes for the classifier.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
-        segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
-        # Load bertForMultipleChoice
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
-        model.eval()
-        # Predict the multiple choice logits
-        with torch.no_grad():
-                multiple_choice_logits = model(tokens_tensor, segments_tensors)
-        # Or get the multiple choice loss
-        labels = torch.tensor([1])
-        multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForQuestionAnswering(*args, **kwargs):
-    """
-    BertForQuestionAnswering is a fine-tuning model that includes BertModel
-    with a token-level classifiers on top of the full sequence of last hidden
-    states. Note that the classification head is only initialized
-    and has to be trained.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForQuestionAnswering
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
-        model.eval()
-        # Predict the start and end positions logits
-        with torch.no_grad():
-                start_logits, end_logits = model(tokens_tensor, segments_tensors)
-        # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
-        start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
-        # set model.train() before if training this loss
-        multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
-    """
-    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForTokenClassification(*args, **kwargs):
-    """
-    BertForTokenClassification is a fine-tuning model that includes BertModel
-    and a token-level classifier on top of the BertModel. Note that the classification
-    head is only initialized and has to be trained.
-
-    The token-level classifier is a linear layer that takes as input the last
-    hidden state of the sequence.
-
-    Args:
-    num_labels: the number (>=2) of classes for the classifier.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForTokenClassification
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
-        model.eval()
-        # Predict the token classification logits
-        with torch.no_grad():
-                classif_logits = model(tokens_tensor, segments_tensors)
-        # Or get the token classification loss
-        labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
-        classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForTokenClassification.from_pretrained(*args, **kwargs)
-    return model
--- a/hubconfs/gpt2_hubconf.py
+++ b/hubconfs/gpt2_hubconf.py
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
-from pytorch_transformers.modeling_gpt2 import (
-    GPT2Model,
-    GPT2LMHeadModel,
-    GPT2DoubleHeadsModel
-)
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-gpt2_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `gpt2`, `gpt2-medium`
-            - a path or url to a pretrained model archive containing:
-                . `gpt2_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
-            - a path or url to a pretrained model archive containing:
-                . `gpt2_config.json` a configuration file for the model
-                . a TensorFlow checkpoint with trained weights
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific GPT-2 class
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def gpt2Tokenizer(*args, **kwargs):
-    """
-    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
-    Peculiarities:
-        - Byte-level BPE
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * gpt2
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying BERT model's
-             sequence length.
-             Default: None
-
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(tokenized_text)
-    """
-    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2Model(*args, **kwargs):
-    """
-    gpt2Model is the basic OpenAI GPT-2 Transformer model based on
-    identical stacked masked self-attention blocks and pre-trained
-    on large scale dataset using language modeling signal.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load gpt2Model
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        # past can be used to reuse precomputed hidden state in a subsequent predictions
-        with torch.no_grad():
-                hidden_states_1, past = model(tokens_tensor_1)
-                hidden_states_2, past = model(tokens_tensor_2, past=past)
-    """
-    model = GPT2Model.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2LMHeadModel(*args, **kwargs):
-    """
-    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
-    tied (pre-trained) language modeling head on top.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load gpt2LMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        # past can be used to reuse precomputed hidden state in a subsequent predictions
-        with torch.no_grad():
-                predictions_1, past = model(tokens_tensor_1)
-                predictions_2, past = model(tokens_tensor_2, past=past)
-
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2DoubleHeadsModel(*args, **kwargs):
-    """
-    gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
-    tied (pre-trained) language modeling head and a multiple choice
-    classification head (only initialized, not pre-trained).
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-
-        #  Prepare tokenized input
-        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        tokenized_text1 = tokenizer.tokenize(text1)
-        tokenized_text2 = tokenizer.tokenize(text2)
-        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-        # Load gpt2DoubleHeadsModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
-    """
-    model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
-    return model
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
-from pytorch_transformers.modeling_openai import (
-	OpenAIGPTModel,
-	OpenAIGPTLMHeadModel,
-	OpenAIGPTDoubleHeadsModel
-)
-
-# Dependecies that are not specified in global hubconf.py
-specific_dependencies = ['spacy', 'ftfy']
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-gpt_docstring = """
-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
-    The embeddings are ordered as follow in the token embeddings matrice:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
-
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
-
-    Params:
-		pretrained_model_name_or_path: either:
-			- a str with the name of a pre-trained model to load selected in the list of:
-				. `openai-gpt`
-			- a path or url to a pretrained model archive containing:
-				. `openai_gpt_config.json` a configuration file for the model
-				. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-			- a path or url to a pretrained model archive containing:
-				. `openai-gpt-config.json` a configuration file for the model
-				. a series of NumPy files containing OpenAI TensorFlow trained weights
-		from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-		cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-		state_dict: an optional state dictionary (collections.OrderedDict object)
-		        	to use instead of pre-trained models
-		*inputs, **kwargs: additional input for the specific OpenAI-GPT class
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def openAIGPTTokenizer(*args, **kwargs):
-    """
-    Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
-	Peculiarities:
-        - lower case all inputs
-        - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-        - argument special_tokens and function set_special_tokens:
-            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * openai-gpt
-    Keyword args:
-	special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
-					Default: None
-	max_len: An artificial maximum length to truncate tokenized sequences to;
-        	 Effective maximum length is always the minimum of this
-             value (if specified) and the underlying BERT model's
-             sequence length.
-			 Default: None
-
-    Example:
-		import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-		
-		text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
-    """
-    tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTModel(*args, **kwargs):
-    """
-    OpenAIGPTModel is the basic OpenAI GPT Transformer model based on
-	identical stacked masked self-attention blocks and pre-trained
-	on large scale dataset using language modeling signal.
-
-    Example:
-        # Load the tokenizer
-		import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-
-        #  Prepare tokenized input
-        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        tokens_tensor = torch.tensor([indexed_tokens])
-
-        # Load openAIGPTModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states = model(tokens_tensor)
-    """
-    model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTLMHeadModel(*args, **kwargs):
-    """
-    OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
-	tied (pre-trained) language modeling head on top.
-
-	Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-
-        #  Prepare tokenized input
-        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        tokens_tensor = torch.tensor([indexed_tokens])
-
-        # Load openAIGPTLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions = model(tokens_tensor)
-
-		# Get the predicted last token
-		predicted_index = torch.argmax(predictions[0, -1, :]).item()
-		predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        '.</w>'
-    """
-    model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTDoubleHeadsModel(*args, **kwargs):
-    """
-    OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
-	tied (pre-trained) language modeling head and a multiple choice
-	classification head (only initialized, not pre-trained).
-
-	Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-
-        #  Prepare tokenized input
-        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        tokenized_text1 = tokenizer.tokenize(text1)
-        tokenized_text2 = tokenizer.tokenize(text2)
-        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-        # Load openAIGPTDoubleHeadsModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
-    """
-    model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
-    return model
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
-from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
-from pytorch_transformers.modeling_transfo_xl import (
-    TransfoXLModel,
-    TransfoXLLMHeadModel
-)
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-transformer_xl_docstring = """
-    Transformer XL use a relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
-    - you don't need to specify positioning embeddings indices
-    - the tokens in the vocabulary have to be sorted to decreasing frequency.
-
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `transfo-xl-wt103`
-            - a path or url to a pretrained model archive containing:
-                . `transfo_xl_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
-            - a path or url to a pretrained model archive containing:
-                . `transfo_xl_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific TransformerXL class
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def transformerXLTokenizer(*args, **kwargs):
-    """
-    Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * transfo-xl-wt103
-
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-        
-        text = "Who was Jim Henson ?"
-        tokenized_text = tokenizer.tokenize(tokenized_text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-    """
-    tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(transformer_xl_docstring)
-def transformerXLModel(*args, **kwargs):
-    """
-    transformerXLModel is the basic Transformer XL model.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        tokenized_text_1 = tokenizer.tokenize(text_1)
-        tokenized_text_2 = tokenizer.tokenize(text_2)
-        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load transformerXLModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        # We can re-use the memory cells in a subsequent call to attend a longer context
-        with torch.no_grad():
-                hidden_states_1, mems_1 = model(tokens_tensor_1)
-                hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
-    """
-    model = TransfoXLModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(transformer_xl_docstring)
-def transformerXLLMHeadModel(*args, **kwargs):
-    """
-    transformerXLModel is the basic Transformer XL model with the
-    tied (pre-trained) language modeling head on top.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        tokenized_text_1 = tokenizer.tokenize(text_1)
-        tokenized_text_2 = tokenizer.tokenize(text_2)
-        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load transformerXLLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        # We can re-use the memory cells in a subsequent call to attend a longer context
-        with torch.no_grad():
-                predictions_1, mems_1 = model(tokens_tensor_1)
-                predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
-
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        assert predicted_token == 'who'
-    """
-    model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
--- a/hubconfs/xlm_hubconf.py
+++ b/hubconfs/xlm_hubconf.py
-from pytorch_transformers.tokenization_xlm import XLMTokenizer
-from pytorch_transformers.modeling_xlm import (
-    XLMConfig,
-    XLMModel,
-    XLMWithLMHeadModel,
-    XLMForSequenceClassification,
-    XLMForQuestionAnswering
-)
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlm_start_docstring = """
-    Model class adapted from the XLM Transformer model of
-        "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-        Paper: https://arxiv.org/abs/1901.07291
-        Original code: https://github.com/facebookresearch/XLM
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-"""
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlm_end_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `xlm-mlm-en-2048`
-            - a path or url to a pretrained model archive containing:
-                . `config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific XLM class
-"""
-
-
-def _begin_with_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-def _end_with_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def xlmTokenizer(*args, **kwargs):
-    """
-    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * xlm-mlm-en-2048
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying model's
-             sequence length.
-             Default: None
-
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
-
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(tokenized_text)
-    """
-    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_begin_with_docstring(xlm_start_docstring)
-@_end_with_docstring(xlm_end_docstring)
-def xlmModel(*args, **kwargs):
-    """
-        # Load xlmModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states_1, mems = model(tokens_tensor_1)
-                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
-    """
-    model = XLMModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_begin_with_docstring(xlm_start_docstring)
-@_end_with_docstring(xlm_end_docstring)
-def xlmLMHeadModel(*args, **kwargs):
-    """
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load xlnetLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions_1, mems = model(tokens_tensor_1)
-                predictions_2, mems = model(tokens_tensor_2, mems=mems)
-
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-# @_end_with_docstring(xlnet_docstring)
-# def xlnetForSequenceClassification(*args, **kwargs):
-#     """
-#     xlnetModel is the basic XLNet Transformer model from
-#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-
-#     Example:
-#         # Load the tokenizer
-#         import torch
-#         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
-
-#         #  Prepare tokenized input
-#         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-#         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-#         tokenized_text1 = tokenizer.tokenize(text1)
-#         tokenized_text2 = tokenizer.tokenize(text2)
-#         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-#         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-#         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-#         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-#         # Load xlnetForSequenceClassification
-#         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
-#         model.eval()
-
-#         # Predict sequence classes logits
-#         with torch.no_grad():
-#                 lm_logits, mems = model(tokens_tensor)
-#     """
-#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
-#     return model
--- a/hubconfs/xlnet_hubconf.1.py
+++ b/hubconfs/xlnet_hubconf.1.py
-from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
-from pytorch_transformers.modeling_xlnet import (
-    XLNetConfig,
-    XLNetModel,
-    XLNetLMHeadModel,
-    # XLNetForSequenceClassification
-)
-
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlnet_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `xlnet-large-cased`
-            - a path or url to a pretrained model archive containing:
-                . `config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
-            - a path or url to a pretrained model archive containing:
-                . `xlnet_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific XLNet class
-"""
-
-
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-
-
-def xlnetTokenizer(*args, **kwargs):
-    """
-    Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
-    Peculiarities:
-        - require Google sentencepiece (https://github.com/google/sentencepiece)
-
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * xlnet-large-cased
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying model's
-             sequence length.
-             Default: None
-
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(tokenized_text)
-    """
-    tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-
-
-@_append_from_pretrained_docstring(xlnet_docstring)
-def xlnetModel(*args, **kwargs):
-    """
-    xlnetModel is the basic XLNet Transformer model from
-        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load xlnetModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states_1, mems = model(tokens_tensor_1)
-                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
-    """
-    model = XLNetModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-@_append_from_pretrained_docstring(xlnet_docstring)
-def xlnetLMHeadModel(*args, **kwargs):
-    """
-    xlnetModel is the basic XLNet Transformer model from
-        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-    with a tied (pre-trained) language modeling head on top.
-
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-
-        # Load xlnetLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
-        model.eval()
-
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions_1, mems = model(tokens_tensor_1)
-                predictions_2, mems = model(tokens_tensor_2, mems=mems)
-
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-
-
-# @_append_from_pretrained_docstring(xlnet_docstring)
-# def xlnetForSequenceClassification(*args, **kwargs):
-#     """
-#     xlnetModel is the basic XLNet Transformer model from
-#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-
-#     Example:
-#         # Load the tokenizer
-#         import torch
-#         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-
-#         #  Prepare tokenized input
-#         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-#         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-#         tokenized_text1 = tokenizer.tokenize(text1)
-#         tokenized_text2 = tokenizer.tokenize(text2)
-#         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-#         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-#         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-#         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-
-#         # Load xlnetForSequenceClassification
-#         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
-#         model.eval()
-
-#         # Predict sequence classes logits
-#         with torch.no_grad():
-#                 lm_logits, mems = model(tokens_tensor)
-#     """
-#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
-#     return model
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -11,7 +11,8 @@ from .tokenization_distilbert import DistilBertTokenizer

 from .tokenization_utils import (PreTrainedTokenizer)

-from .modeling_auto import (AutoConfig, AutoModel)
+from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
+                            AutoModelWithLMHead)

 from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
                            BertForMaskedLM, BertForNextSentencePrediction,

--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -18,24 +18,20 @@ from __future__ import absolute_import, division, print_function, unicode_litera

 import logging

-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss, MSELoss
-from torch.nn.parameter import Parameter
-
-from .modeling_bert import BertConfig, BertModel
-from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
-from .modeling_gpt2 import GPT2Config, GPT2Model
-from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
-from .modeling_xlnet import XLNetConfig, XLNetModel
-from .modeling_xlm import XLMConfig, XLMModel
-from .modeling_roberta import RobertaConfig, RobertaModel
-from .modeling_distilbert import DistilBertConfig, DistilBertModel
-
-from .modeling_utils import PreTrainedModel, SequenceSummary
+from .modeling_bert import BertConfig, BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
+from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTLMHeadModel
+from .modeling_gpt2 import GPT2Config, GPT2Model, GPT2LMHeadModel
+from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel
+from .modeling_xlnet import XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
+from .modeling_xlm import XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
+from .modeling_roberta import RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
+from .modeling_distilbert import DistilBertConfig, DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
+
+from .modeling_utils import PreTrainedModel, SequenceSummary, add_start_docstrings

 logger = logging.getLogger(__name__)

+
 class AutoConfig(object):
    r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
        that will be instantiated as one of the configuration classes of the library
@@ -47,6 +43,7 @@ class AutoConfig(object):

        The base model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertConfig (DistilBERT model)
            - contains `bert`: BertConfig (Bert model)
            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
@@ -68,6 +65,7 @@ class AutoConfig(object):

        The configuration class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertConfig (DistilBERT model)
            - contains `bert`: BertConfig (Bert model)
            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
@@ -77,26 +75,32 @@ class AutoConfig(object):
            - contains `roberta`: RobertaConfig (RoBERTa model)

        Params:
-            **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                    using the `save_pretrained(save_directory)` method.
-                - a path or url to a saved configuration `file`.
-            **cache_dir**: (`optional`) string:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
+
+            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.
-            **return_unused_kwargs**: (`optional`) bool:
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+
+                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            return_unused_kwargs: (`optional`) bool:
+
                - If False, then this function returns just the final configuration object.
-                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
-                is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
-                ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
-            **kwargs**: (`optional`) dict:
-                Dictionary of key/value pairs with which to update the configuration object after loading.
-                - The values in kwargs of any keys which are configuration attributes will be used
-                to override the loaded values.
-                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-                by the `return_unused_kwargs` keyword parameter.
+                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.

        Examples::

@@ -140,20 +144,21 @@ class AutoModel(object):
        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
        class method.

-        The `from_pretrained()` method take care of returning the correct model class instance
+        The `from_pretrained()` method takes care of returning the correct model class instance
        using pattern matching on the `pretrained_model_name_or_path` string.

        The base model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
            - contains `bert`: BertModel (Bert model)
            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
            - contains `xlnet`: XLNetModel (XLNet model)
            - contains `xlm`: XLMModel (XLM model)
-            - contains `roberta`: RobertaModel (RoBERTa model)

-        This class cannot be instantiated using `__init__()` (throw an error).
+        This class cannot be instantiated using `__init__()` (throws an error).
    """
    def __init__(self):
        raise EnvironmentError("AutoModel is designed to be instantiated "
@@ -161,61 +166,64 @@ class AutoModel(object):

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiate a one of the base model classes of the library
+        r""" Instantiates one of the base model classes of the library
        from a pre-trained model configuration.

-        The base model class to instantiate is selected as the first pattern matching
+        The model class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
            - contains `bert`: BertModel (Bert model)
            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
            - contains `xlnet`: XLNetModel (XLNet model)
            - contains `xlm`: XLMModel (XLM model)
-            - contains `roberta`: RobertaModel (RoBERTa model)

            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
            To train the model, you should first set it back in training mode with `model.train()`

        Params:
-            **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                    using the `save_pretrained(save_directory)` method.
-                - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
-                    In this case, ``from_tf`` should be set to True and a configuration object should be
-                    provided as `config` argument. This loading option is slower than converting the TensorFlow
-                    checkpoint in a PyTorch model using the provided conversion scripts and loading
-                    the PyTorch model afterwards.
-            **model_args**: (`optional`) Sequence:
-                All remaning positional arguments will be passed to the underlying model's __init__ function
-            **config**: an optional configuration for the model to use instead of an automatically loaded configuation.
-                Configuration can be automatically loaded when:
-                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
-                - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
-            **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
-                from saved weights file.
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
-                a simpler option.
-            **cache_dir**: (`optional`) string:
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
                configuration should be cached if the standard cache should not be used.
-            **output_loading_info**: (`optional`) boolean:
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-            **kwargs**: (`optional`) dict:
-                Dictionary of key, values to update the configuration object after loading.
-                Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
-
-               - If a configuration is provided with `config`, **kwargs will be directly passed
-                 to the underlying model's __init__ method.
-               - If a configuration is not provided, **kwargs will be first passed to the pretrained
-                 model configuration class loading function (`PretrainedConfig.from_pretrained`).
-                 Each key of **kwargs that corresponds to a configuration attribute
-                 will be used to override said attribute with the supplied **kwargs value.
-                 Remaining keys that do not correspond to any configuration attribute will
-                 be passed to the underlying model's __init__ function.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.

        Examples::

@@ -248,3 +256,345 @@ class AutoModel(object):
        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+
+
+class AutoModelWithLMHead(object):
+    r"""
+        :class:`~pytorch_transformers.AutoModelWithLMHead` is a generic model class
+        that will be instantiated as one of the language modeling model classes of the library
+        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
+            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
+            - contains `bert`: BertForMaskedLM (Bert model)
+            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
+            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
+            - contains `xlm`: XLMWithLMHeadModel (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
+            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the language modeling model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
+            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
+            - contains `bert`: BertForMaskedLM (Bert model)
+            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
+            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
+            - contains `xlm`: XLMWithLMHeadModel (XLM model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+
+
+class AutoModelForSequenceClassification(object):
+    r"""
+        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a generic model class
+        that will be instantiated as one of the sequence classification model classes of the library
+        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: BertForSequenceClassification (Bert model)
+            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: XLMForSequenceClassification (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
+            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the sequence classification model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: BertForSequenceClassification (Bert model)
+            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: XLMForSequenceClassification (XLM model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
+
+
+class AutoModelForQuestionAnswering(object):
+    r"""
+        :class:`~pytorch_transformers.AutoModelForQuestionAnswering` is a generic model class
+        that will be instantiated as one of the question answering model classes of the library
+        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: BertForQuestionAnswering (Bert model)
+            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: XLMForQuestionAnswering (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
+            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the question answering model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: BertForQuestionAnswering (Bert model)
+            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: XLMForQuestionAnswering (XLM model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -59,6 +59,12 @@ if not six.PY2:
            fn.__doc__ = ''.join(docstr) + fn.__doc__
            return fn
        return docstring_decorator
+
+    def add_end_docstrings(*docstr):
+        def docstring_decorator(fn):
+            fn.__doc__ = fn.__doc__ + ''.join(docstr)
+            return fn
+        return docstring_decorator
 else:
    # Not possible to update class docstrings on python2
    def add_start_docstrings(*docstr):
@@ -66,6 +72,11 @@ else:
            return fn
        return docstring_decorator

+    def add_end_docstrings(*docstr):
+        def docstring_decorator(fn):
+            return fn
+        return docstring_decorator
+

 class PretrainedConfig(object):
    r""" Base class for all configuration classes.

--- a/pytorch_transformers/tests/modeling_auto_test.py
+++ b/pytorch_transformers/tests/modeling_auto_test.py
@@ -21,7 +21,11 @@ import shutil
 import pytest
 import logging

-from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
+from pytorch_transformers import (AutoConfig, BertConfig,
+                                  AutoModel, BertModel,
+                                  AutoModelWithLMHead, BertForMaskedLM,
+                                  AutoModelForSequenceClassification, BertForSequenceClassification,
+                                  AutoModelForQuestionAnswering, BertForQuestionAnswering)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP

 from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
@@ -42,6 +46,42 @@ class AutoModelTest(unittest.TestCase):
            for value in loading_info.values():
                self.assertEqual(len(value), 0)

+    def test_lmhead_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+
+            model = AutoModelWithLMHead.from_pretrained(model_name)
+            model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertForMaskedLM)
+
+    def test_sequence_classification_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+
+            model = AutoModelForSequenceClassification.from_pretrained(model_name)
+            model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertForSequenceClassification)
+
+    def test_question_answering_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+
+            model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+            model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertForQuestionAnswering)
+

 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tokenization_auto.py
+++ b/pytorch_transformers/tokenization_auto.py
@@ -25,6 +25,7 @@ from .tokenization_transfo_xl import TransfoXLTokenizer
 from .tokenization_xlnet import XLNetTokenizer
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
+from .tokenization_distilbert import DistilBertTokenizer

 logger = logging.getLogger(__name__)

@@ -39,13 +40,14 @@ class AutoTokenizer(object):

        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
            - contains `xlnet`: XLNetTokenizer (XLNet model)
            - contains `xlm`: XLMTokenizer (XLM model)
-            - contains `roberta`: RobertaTokenizer (RoBERTa model)

        This class cannot be instantiated using `__init__()` (throw an error).
    """
@@ -60,32 +62,45 @@ class AutoTokenizer(object):

        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
+            - contains `roberta`: RobertaTokenizer (XLM model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
            - contains `xlnet`: XLNetTokenizer (XLNet model)
            - contains `xlm`: XLMTokenizer (XLM model)
-            - contains `roberta`: RobertaTokenizer (XLM model)

        Params:
-            **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                    using the `save_pretrained(save_directory)` method.
-                - a path or url to a saved configuration `file`.
-            **cache_dir**: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the vocabulary files and override the cached versions if they exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
+
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.

        Examples::

-            config = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
-            config = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
+            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`

        """
-        if 'roberta' in pretrained_model_name_or_path:
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)