Merge pull request #1110 from huggingface/automodels

Torch.hub now based on AutoModels - Updating AutoModels with AutoModelWithLMHead, Sequence Classification and Question Answering

Merge pull request #1110 from huggingface/automodels
Torch.hub now based on AutoModels - Updating AutoModels with AutoModelWithLMHead, Sequence Classification and Question Answering
12b9cc9e · Thomas Wolf · GitHub · 282c276e · bfe93a5a · 12b9cc9e
Unverified Commit 12b9cc9e authored Aug 30, 2019 by Thomas Wolf Committed by GitHub Aug 30, 2019
12 changed files
--- a/hubconf.py
+++ b/hubconf.py
-dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex']
+from pytorch_transformers import (
+    AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
-from hubconfs.bert_hubconf import (
-    bertTokenizer,
-    bertModel,
-    bertForNextSentencePrediction,
-    bertForPreTraining,
-    bertForMaskedLM,
-    bertForSequenceClassification,
-    bertForMultipleChoice,
-    bertForQuestionAnswering,
-    bertForTokenClassification
-)
-from hubconfs.gpt_hubconf import (
-    openAIGPTTokenizer,
-    openAIGPTModel,
-    openAIGPTLMHeadModel,
-    openAIGPTDoubleHeadsModel
-)
-from hubconfs.gpt2_hubconf import (
-    gpt2Tokenizer,
-    gpt2Model,
-    gpt2LMHeadModel,
-    gpt2DoubleHeadsModel
-)
-from hubconfs.transformer_xl_hubconf import (
-    transformerXLTokenizer,
-    transformerXLModel,
-    transformerXLLMHeadModel
 )
+from pytorch_transformers.modeling_utils import add_start_docstrings
+dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
+@add_start_docstrings(AutoConfig.__doc__)
+def config(*args, **kwargs):
+    r""" 
+                # Using torch.hub !
+                import torch
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased')  # Download configuration from S3 and cache.
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/bert_saved_model/')`
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', './test/bert_saved_model/my_configuration.json')
+                config = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attentions=True, foo=False)
+                assert config.output_attentions == True
+                config, unused_kwargs = torch.hub.load('huggingface/pytorch-transformers', 'config', 'bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
+                assert config.output_attentions == True
+                assert unused_kwargs == {'foo': False}
+            """
+    return AutoConfig.from_pretrained(*args, **kwargs)
+@add_start_docstrings(AutoTokenizer.__doc__)
+def tokenizer(*args, **kwargs):
+    r""" 
+        # Using torch.hub !
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-uncased')    # Download vocabulary from S3 and cache.
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', './test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/bert_saved_model/')`
+    """
+    return AutoTokenizer.from_pretrained(*args, **kwargs)
+@add_start_docstrings(AutoModel.__doc__)
+def model(*args, **kwargs):
+    r"""
+            # Using torch.hub !
+            import torch
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = torch.hub.load('huggingface/pytorch-transformers', 'model', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        """
+    return AutoModel.from_pretrained(*args, **kwargs)
+@add_start_docstrings(AutoModelWithLMHead.__doc__)
+def modelWithLMHead(*args, **kwargs):
+    r"""
+        # Using torch.hub !
+        import torch
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+        assert model.config.output_attentions == True
+        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+    """
+    return AutoModelWithLMHead.from_pretrained(*args, **kwargs)
+@add_start_docstrings(AutoModelForSequenceClassification.__doc__)
+def modelForSequenceClassification(*args, **kwargs):
+    r"""
+            # Using torch.hub !
+            import torch
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        """
+    return AutoModelForSequenceClassification.from_pretrained(*args, **kwargs)
+@add_start_docstrings(AutoModelForQuestionAnswering.__doc__)
+def modelForQuestionAnswering(*args, **kwargs):
+    r"""
+        # Using torch.hub !
+        import torch
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased')    # Download model and configuration from S3 and cache.
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/bert_model/')`
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-uncased', output_attentions=True)  # Update configuration during loading
+        assert model.config.output_attentions == True
+        # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+        config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+        model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', './tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+    """
+    return AutoModelForQuestionAnswering.from_pretrained(*args, **kwargs)
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
-from pytorch_transformers.tokenization_bert import BertTokenizer
-from pytorch_transformers.modeling_bert import (
-        BertModel,
-        BertForNextSentencePrediction,
-        BertForMaskedLM,
-        BertForMultipleChoice,
-        BertForPreTraining,
-        BertForQuestionAnswering,
-        BertForSequenceClassification,
-        BertForTokenClassification,
-        )
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-bert_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load
-                . `bert-base-uncased`
-                . `bert-large-uncased`
-                . `bert-base-cased`
-                . `bert-large-cased`
-                . `bert-base-multilingual-uncased`
-                . `bert-base-multilingual-cased`
-                . `bert-base-chinese`
-                . `bert-base-german-cased`
-                . `bert-large-uncased-whole-word-masking`
-                . `bert-large-cased-whole-word-masking`
-            - a path or url to a pretrained model archive containing:
-                . `bert_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining
-                  instance
-            - a path or url to a pretrained model archive containing:
-                . `bert_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow
-                 checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models
-                   will be cached.
-        state_dict: an optional state dictionary
-                    (collections.OrderedDict object) to use instead of Google
-                    pre-trained models
-        *inputs, **kwargs: additional input for the specific Bert class
-            (ex: num_labels for BertForSequenceClassification)
-"""
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-def bertTokenizer(*args, **kwargs):
-    """
-    Instantiate a BertTokenizer from a pre-trained/customized vocab file
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * bert-base-uncased
-                                       * bert-large-uncased
-                                       * bert-base-cased
-                                       * bert-large-cased
-                                       * bert-base-multilingual-uncased
-                                       * bert-base-multilingual-cased
-                                       * bert-base-chinese
-    Keyword args:
-    cache_dir: an optional path to a specific directory to download and cache
-               the pre-trained model weights.
-               Default: None
-    do_lower_case: Whether to lower case the input.
-                   Only has an effect when do_wordpiece_only=False
-                   Default: True
-    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-                       Default: True
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying BERT model's
-             sequence length.
-             Default: None
-    never_split: List of tokens which will never be split during tokenization.
-                 Only has an effect when do_wordpiece_only=False
-                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
-    Example:
-        import torch
-        sentence = 'Hello, World!'
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        toks = tokenizer.tokenize(sentence)
-        ['Hello', '##,', 'World', '##!']
-        ids = tokenizer.convert_tokens_to_ids(toks)
-        [8667, 28136, 1291, 28125]
-    """
-    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-@_append_from_pretrained_docstring(bert_docstring)
-def bertModel(*args, **kwargs):
-    """
-    BertModel is the basic BERT Transformer model with a layer of summed token,
-    position and sequence embeddings followed by a series of identical
-    self-attention blocks (12 for BERT-base, 24 for BERT-large).
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                encoded_layers, _ = model(tokens_tensor, segments_tensors)
-    """
-    model = BertModel.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForNextSentencePrediction(*args, **kwargs):
-    """
-    BERT model with next sentence prediction head.
-    This module comprises the BERT model followed by the next sentence
-    classification head.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForNextSentencePrediction
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
-        model.eval()
-        # Predict the next sentence classification logits
-        with torch.no_grad():
-                next_sent_classif_logits = model(tokens_tensor, segments_tensors)
-    """
-    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForPreTraining(*args, **kwargs):
-    """
-    BERT model with pre-training heads.
-    This module comprises the BERT model followed by the two pre-training heads
-        - the masked language modeling head, and
-        - the next sentence classification head.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForPreTraining
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
-        masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
-    """
-    model = BertForPreTraining.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForMaskedLM(*args, **kwargs):
-    """
-    BertForMaskedLM includes the BertModel Transformer followed by the
-    (possibly) pre-trained masked language modeling head.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        masked_index = 8
-        tokenized_text[masked_index] = '[MASK]'
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForMaskedLM
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
-        model.eval()
-        # Predict all tokens
-        with torch.no_grad():
-                predictions = model(tokens_tensor, segments_tensors)
-        predicted_index = torch.argmax(predictions[0, masked_index]).item()
-        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        'henson'
-    """
-    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForSequenceClassification(*args, **kwargs):
-    """
-    BertForSequenceClassification is a fine-tuning model that includes
-    BertModel and a sequence-level (sequence or pair of sequences) classifier
-    on top of the BertModel. Note that the classification head is only initialized
-    and has to be trained.
-    The sequence-level classifier is a linear layer that takes as input the
-    last hidden state of the first token in the input sequence
-    (see Figures 3a and 3b in the BERT paper).
-    Args:
-    num_labels: the number (>=2) of classes for the classifier.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForSequenceClassification
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
-        model.eval()
-        # Predict the sequence classification logits
-        with torch.no_grad():
-                seq_classif_logits = model(tokens_tensor, segments_tensors)
-        # Or get the sequence classification loss
-        labels = torch.tensor([1])
-        seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForMultipleChoice(*args, **kwargs):
-    """
-    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
-    linear layer on top of the BertModel. Note that the multiple choice head is
-    only initialized and has to be trained.
-    Args:
-    num_choices: the number (>=2) of classes for the classifier.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
-        segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
-        # Load bertForMultipleChoice
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
-        model.eval()
-        # Predict the multiple choice logits
-        with torch.no_grad():
-                multiple_choice_logits = model(tokens_tensor, segments_tensors)
-        # Or get the multiple choice loss
-        labels = torch.tensor([1])
-        multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForQuestionAnswering(*args, **kwargs):
-    """
-    BertForQuestionAnswering is a fine-tuning model that includes BertModel
-    with token-level classifiers on top of the full sequence of last hidden
-    states. Note that the classification head is only initialized
-    and has to be trained.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForQuestionAnswering
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
-        model.eval()
-        # Predict the start and end positions logits
-        with torch.no_grad():
-                start_logits, end_logits = model(tokens_tensor, segments_tensors)
-        # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
-        start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
-        # set model.train() before if training this loss
-        total_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
-    """
-    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(bert_docstring)
-def bertForTokenClassification(*args, **kwargs):
-    """
-    BertForTokenClassification is a fine-tuning model that includes BertModel
-    and a token-level classifier on top of the BertModel. Note that the classification
-    head is only initialized and has to be trained.
-    The token-level classifier is a linear layer that takes as input the last
-    hidden state of the sequence.
-    Args:
-    num_labels: the number (>=2) of classes for the classifier.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        #  Prepare tokenized input
-        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        tokens_tensor = torch.tensor([indexed_tokens])
-        segments_tensors = torch.tensor([segments_ids])
-        # Load bertForTokenClassification
-        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
-        model.eval()
-        # Predict the token classification logits
-        with torch.no_grad():
-                classif_logits = model(tokens_tensor, segments_tensors)
-        # Or get the token classification loss
-        labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
-        classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
-    """
-    model = BertForTokenClassification.from_pretrained(*args, **kwargs)
-    return model
--- a/hubconfs/gpt2_hubconf.py
+++ b/hubconfs/gpt2_hubconf.py
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
-from pytorch_transformers.modeling_gpt2 import (
-    GPT2Model,
-    GPT2LMHeadModel,
-    GPT2DoubleHeadsModel
-)
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-gpt2_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `gpt2`, `gpt2-medium`
-            - a path or url to a pretrained model archive containing:
-                . `gpt2_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
-            - a path or url to a pretrained model archive containing:
-                . `gpt2_config.json` a configuration file for the model
-                . a TensorFlow checkpoint with trained weights
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific GPT-2 class
-"""
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-def gpt2Tokenizer(*args, **kwargs):
-    """
-    Instantiate a GPT-2 BPE tokenizer for OpenAI GPT-2 from a pre-trained/customized vocab file.
-    Peculiarities:
-        - Byte-level BPE
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * gpt2
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying GPT-2 model's
-             sequence length.
-             Default: None
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(text)
-    """
-    tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2Model(*args, **kwargs):
-    """
-    gpt2Model is the basic OpenAI GPT-2 Transformer model based on
-    identical stacked masked self-attention blocks and pre-trained
-    on large scale dataset using language modeling signal.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-        # Load gpt2Model
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
-        model.eval()
-        # Predict hidden states features for each layer
-        # past can be used to reuse precomputed hidden states in subsequent predictions
-        with torch.no_grad():
-                hidden_states_1, past = model(tokens_tensor_1)
-                hidden_states_2, past = model(tokens_tensor_2, past=past)
-    """
-    model = GPT2Model.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2LMHeadModel(*args, **kwargs):
-    """
-    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
-    tied (pre-trained) language modeling head on top.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-        # Load gpt2LMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
-        model.eval()
-        # Predict the language modeling logits for each position
-        # past can be used to reuse precomputed hidden states in subsequent predictions
-        with torch.no_grad():
-                predictions_1, past = model(tokens_tensor_1)
-                predictions_2, past = model(tokens_tensor_2, past=past)
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(gpt2_docstring)
-def gpt2DoubleHeadsModel(*args, **kwargs):
-    """
-    gpt2DoubleHeadsModel is the OpenAI GPT-2 Transformer model with the
-    tied (pre-trained) language modeling head and a multiple choice
-    classification head (only initialized, not pre-trained).
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
-        #  Prepare tokenized input
-        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        tokenized_text1 = tokenizer.tokenize(text1)
-        tokenized_text2 = tokenizer.tokenize(text2)
-        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-        # Load gpt2DoubleHeadsModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
-    """
-    model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
-    return model
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
-from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
-from pytorch_transformers.modeling_openai import (
-	OpenAIGPTModel,
-	OpenAIGPTLMHeadModel,
-	OpenAIGPTDoubleHeadsModel
-)
-# Dependencies that are not specified in global hubconf.py
-specific_dependencies = ['spacy', 'ftfy']
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-gpt_docstring = """
-    OpenAI GPT uses a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
-    The embeddings are ordered as follows in the token embeddings matrix:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
-    Params:
-		pretrained_model_name_or_path: either:
-			- a str with the name of a pre-trained model to load selected in the list of:
-				. `openai-gpt`
-			- a path or url to a pretrained model archive containing:
-				. `openai_gpt_config.json` a configuration file for the model
-				. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-			- a path or url to a pretrained model archive containing:
-				. `openai-gpt-config.json` a configuration file for the model
-				. a series of NumPy files containing OpenAI TensorFlow trained weights
-		from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-		cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-		state_dict: an optional state dictionary (collections.OrderedDict object)
-		        	to use instead of pre-trained models
-		*inputs, **kwargs: additional input for the specific OpenAI-GPT class
-"""
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-def openAIGPTTokenizer(*args, **kwargs):
-    """
-    Instantiate a BPE tokenizer for OpenAI GPT from a pre-trained/customized vocab file.
-	Peculiarities:
-        - lower case all inputs
-        - uses SpaCy tokenizer ('en' model) and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-        - argument special_tokens and function set_special_tokens:
-            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * openai-gpt
-    Keyword args:
-	special_tokens: Special tokens in vocabulary that are not pretrained ([SEP], [CLS]...)
-					Default: None
-	max_len: An artificial maximum length to truncate tokenized sequences to;
-        	 Effective maximum length is always the minimum of this
-             value (if specified) and the underlying BERT model's
-             sequence length.
-			 Default: None
-    Example:
-		import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-		text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
-    """
-    tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTModel(*args, **kwargs):
-    """
-    OpenAIGPTModel is the basic OpenAI GPT Transformer model based on
-	identical stacked masked self-attention blocks and pre-trained
-	on large scale dataset using language modeling signal.
-    Example:
-        # Load the tokenizer
-		import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-        #  Prepare tokenized input
-        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        tokens_tensor = torch.tensor([indexed_tokens])
-        # Load openAIGPTModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states = model(tokens_tensor)
-    """
-    model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTLMHeadModel(*args, **kwargs):
-    """
-    OpenAIGPTLMHeadModel is the OpenAI GPT Transformer model with the
-	tied (pre-trained) language modeling head on top.
-	Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-        #  Prepare tokenized input
-        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        tokenized_text = tokenizer.tokenize(text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        tokens_tensor = torch.tensor([indexed_tokens])
-        # Load openAIGPTLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions = model(tokens_tensor)
-		# Get the predicted last token
-		predicted_index = torch.argmax(predictions[0, -1, :]).item()
-		predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        '.</w>'
-    """
-    model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(gpt_docstring)
-def openAIGPTDoubleHeadsModel(*args, **kwargs):
-    """
-    OpenAIGPTDoubleHeadsModel is the OpenAI GPT Transformer model with the
-	tied (pre-trained) language modeling head and a multiple choice
-	classification head (only initialized, not pre-trained).
-	Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
-        #  Prepare tokenized input
-        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        tokenized_text1 = tokenizer.tokenize(text1)
-        tokenized_text2 = tokenizer.tokenize(text2)
-        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-        # Load openAIGPTDoubleHeadsModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
-    """
-    model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
-    return model
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
-from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
-from pytorch_transformers.modeling_transfo_xl import (
-    TransfoXLModel,
-    TransfoXLLMHeadModel
-)
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-transformer_xl_docstring = """
-    Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
-    - you don't need to specify positioning embeddings indices
-    - the tokens in the vocabulary have to be sorted to decreasing frequency.
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `transfo-xl-wt103`
-            - a path or url to a pretrained model archive containing:
-                . `transfo_xl_config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a TransfoXLModel instance
-            - a path or url to a pretrained model archive containing:
-                . `transfo_xl_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific TransformerXL class
-"""
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-def transformerXLTokenizer(*args, **kwargs):
-    """
-    Instantiate a Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * transfo-xl-wt103
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-        text = "Who was Jim Henson ?"
-        tokenized_text = tokenizer.tokenize(tokenized_text)
-        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-    """
-    tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-@_append_from_pretrained_docstring(transformer_xl_docstring)
-def transformerXLModel(*args, **kwargs):
-    """
-    transformerXLModel is the basic Transformer XL model.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        tokenized_text_1 = tokenizer.tokenize(text_1)
-        tokenized_text_2 = tokenizer.tokenize(text_2)
-        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-        # Load transformerXLModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
-        model.eval()
-        # Predict hidden states features for each layer
-        # We can re-use the memory cells in a subsequent call to attend a longer context
-        with torch.no_grad():
-                hidden_states_1, mems_1 = model(tokens_tensor_1)
-                hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
-    """
-    model = TransfoXLModel.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(transformer_xl_docstring)
-def transformerXLLMHeadModel(*args, **kwargs):
-    """
-    transformerXLModel is the basic Transformer XL model with the
-    tied (pre-trained) language modeling head on top.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        tokenized_text_1 = tokenizer.tokenize(text_1)
-        tokenized_text_2 = tokenizer.tokenize(text_2)
-        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-        # Load transformerXLLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
-        model.eval()
-        # Predict hidden states features for each layer
-        # We can re-use the memory cells in a subsequent call to attend a longer context
-        with torch.no_grad():
-                predictions_1, mems_1 = model(tokens_tensor_1)
-                predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        assert predicted_token == 'who'
-    """
-    model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
--- a/hubconfs/xlm_hubconf.py
+++ b/hubconfs/xlm_hubconf.py
-from pytorch_transformers.tokenization_xlm import XLMTokenizer
-from pytorch_transformers.modeling_xlm import (
-    XLMConfig,
-    XLMModel,
-    XLMWithLMHeadModel,
-    XLMForSequenceClassification,
-    XLMForQuestionAnswering
-)
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlm_start_docstring = """
-    Model class adapted from the XLM Transformer model of
-        "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
-        Paper: https://arxiv.org/abs/1901.07291
-        Original code: https://github.com/facebookresearch/XLM
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-"""
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlm_end_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `xlm-mlm-en-2048`
-            - a path or url to a pretrained model archive containing:
-                . `config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific XLM class
-"""
-def _begin_with_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-def _end_with_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-def xlmTokenizer(*args, **kwargs):
-    """
-    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * xlm-mlm-en-2048
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying model's
-             sequence length.
-             Default: None
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(tokenized_text)
-    """
-    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-@_begin_with_docstring(xlm_start_docstring)
-@_end_with_docstring(xlm_end_docstring)
-def xlmModel(*args, **kwargs):
-    """
-        # Load xlmModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states_1, mems = model(tokens_tensor_1)
-                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
-    """
-    model = XLMModel.from_pretrained(*args, **kwargs)
-    return model
-@_begin_with_docstring(xlm_start_docstring)
-@_end_with_docstring(xlm_end_docstring)
-def xlmLMHeadModel(*args, **kwargs):
-    """
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-        # Load xlnetLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions_1, mems = model(tokens_tensor_1)
-                predictions_2, mems = model(tokens_tensor_2, mems=mems)
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-# @_end_with_docstring(xlnet_docstring)
-# def xlnetForSequenceClassification(*args, **kwargs):
-#     """
-#     xlnetModel is the basic XLNet Transformer model from
-#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-#     Example:
-#         # Load the tokenizer
-#         import torch
-#         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
-#         #  Prepare tokenized input
-#         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-#         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-#         tokenized_text1 = tokenizer.tokenize(text1)
-#         tokenized_text2 = tokenizer.tokenize(text2)
-#         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-#         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-#         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-#         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-#         # Load xlnetForSequenceClassification
-#         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
-#         model.eval()
-#         # Predict sequence classes logits
-#         with torch.no_grad():
-#                 lm_logits, mems = model(tokens_tensor)
-#     """
-#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
-#     return model
--- a/hubconfs/xlnet_hubconf.1.py
+++ b/hubconfs/xlnet_hubconf.1.py
-from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
-from pytorch_transformers.modeling_xlnet import (
-    XLNetConfig,
-    XLNetModel,
-    XLNetLMHeadModel,
-    # XLNetForSequenceClassification
-)
-# A lot of models share the same param doc. Use a decorator
-# to save typing
-xlnet_docstring = """
-    Params:
-        pretrained_model_name_or_path: either:
-            - a str with the name of a pre-trained model to load selected in the list of:
-                . `xlnet-large-cased`
-            - a path or url to a pretrained model archive containing:
-                . `config.json` a configuration file for the model
-                . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
-            - a path or url to a pretrained model archive containing:
-                . `xlnet_config.json` a configuration file for the model
-                . `model.chkpt` a TensorFlow checkpoint
-        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-        *inputs, **kwargs: additional input for the specific XLNet class
-"""
-def _append_from_pretrained_docstring(docstr):
-    def docstring_decorator(fn):
-        fn.__doc__ = fn.__doc__ + docstr
-        return fn
-    return docstring_decorator
-def xlnetTokenizer(*args, **kwargs):
-    """
-    Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
-    Peculiarities:
-        - require Google sentencepiece (https://github.com/google/sentencepiece)
-    Args:
-    pretrained_model_name_or_path: Path to pretrained model archive
-                                   or one of pre-trained vocab configs below.
-                                       * xlnet-large-cased
-    Keyword args:
-    special_tokens: Special tokens in vocabulary that are not pretrained
-                    Default: None
-    max_len: An artificial maximum length to truncate tokenized sequences to;
-             Effective maximum length is always the minimum of this
-             value (if specified) and the underlying model's
-             sequence length.
-             Default: None
-    Example:
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-        text = "Who was Jim Henson ?"
-        indexed_tokens = tokenizer.encode(tokenized_text)
-    """
-    tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
-    return tokenizer
-@_append_from_pretrained_docstring(xlnet_docstring)
-def xlnetModel(*args, **kwargs):
-    """
-    xlnetModel is the basic XLNet Transformer model from
-        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-        # Load xlnetModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                hidden_states_1, mems = model(tokens_tensor_1)
-                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
-    """
-    model = XLNetModel.from_pretrained(*args, **kwargs)
-    return model
-@_append_from_pretrained_docstring(xlnet_docstring)
-def xlnetLMHeadModel(*args, **kwargs):
-    """
-    xlnetModel is the basic XLNet Transformer model from
-        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-        by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-    with a tied (pre-trained) language modeling head on top.
-    Example:
-        # Load the tokenizer
-        import torch
-        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-        #  Prepare tokenized input
-        text_1 = "Who was Jim Henson ?"
-        text_2 = "Jim Henson was a puppeteer"
-        indexed_tokens_1 = tokenizer.encode(text_1)
-        indexed_tokens_2 = tokenizer.encode(text_2)
-        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
-        # Load xlnetLMHeadModel
-        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
-        model.eval()
-        # Predict hidden states features for each layer
-        with torch.no_grad():
-                predictions_1, mems = model(tokens_tensor_1)
-                predictions_2, mems = model(tokens_tensor_2, mems=mems)
-        # Get the predicted last token
-        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        predicted_token = tokenizer.decode([predicted_index])
-        assert predicted_token == ' who'
-    """
-    model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
-    return model
-# @_append_from_pretrained_docstring(xlnet_docstring)
-# def xlnetForSequenceClassification(*args, **kwargs):
-#     """
-#     xlnetModel is the basic XLNet Transformer model from
-#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
-#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
-#     Example:
-#         # Load the tokenizer
-#         import torch
-#         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
-#         #  Prepare tokenized input
-#         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-#         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-#         tokenized_text1 = tokenizer.tokenize(text1)
-#         tokenized_text2 = tokenizer.tokenize(text2)
-#         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-#         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-#         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-#         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
-#         # Load xlnetForSequenceClassification
-#         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
-#         model.eval()
-#         # Predict sequence classes logits
-#         with torch.no_grad():
-#                 lm_logits, mems = model(tokens_tensor)
-#     """
-#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
-#     return model
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -11,7 +11,8 @@ from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_utils import (PreTrainedTokenizer)
-from .modeling_auto import (AutoConfig, AutoModel)
+from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
+                            AutoModelWithLMHead)
 from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
                            BertForMaskedLM, BertForNextSentencePrediction,

--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -59,6 +59,12 @@ if not six.PY2:
            fn.__doc__ = ''.join(docstr) + fn.__doc__
            return fn
        return docstring_decorator
+    def add_end_docstrings(*docstr):
+        def docstring_decorator(fn):
+            fn.__doc__ = fn.__doc__ + ''.join(docstr)
+            return fn
+        return docstring_decorator
 else:
    # Not possible to update class docstrings on python2
    def add_start_docstrings(*docstr):
@@ -66,6 +72,11 @@ else:
            return fn
        return docstring_decorator
+    def add_end_docstrings(*docstr):
+        def docstring_decorator(fn):
+            return fn
+        return docstring_decorator
 class PretrainedConfig(object):
    r""" Base class for all configuration classes.

--- a/pytorch_transformers/tests/modeling_auto_test.py
+++ b/pytorch_transformers/tests/modeling_auto_test.py
@@ -21,7 +21,11 @@ import shutil
 import pytest
 import logging
-from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
+from pytorch_transformers import (AutoConfig, BertConfig,
+                                  AutoModel, BertModel,
+                                  AutoModelWithLMHead, BertForMaskedLM,
+                                  AutoModelForSequenceClassification, BertForSequenceClassification,
+                                  AutoModelForQuestionAnswering, BertForQuestionAnswering)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
@@ -42,6 +46,42 @@ class AutoModelTest(unittest.TestCase):
            for value in loading_info.values():
                self.assertEqual(len(value), 0)
+    def test_lmhead_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+            model = AutoModelWithLMHead.from_pretrained(model_name)
+            model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertForMaskedLM)
+    def test_sequence_classification_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+            model = AutoModelForSequenceClassification.from_pretrained(model_name)
+            model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertForSequenceClassification)
+    def test_question_answering_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+            model = AutoModelForQuestionAnswering.from_pretrained(model_name)
+            model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertForQuestionAnswering)
 if __name__ == "__main__":
    unittest.main()
--- a/pytorch_transformers/tokenization_auto.py
+++ b/pytorch_transformers/tokenization_auto.py
@@ -25,6 +25,7 @@ from .tokenization_transfo_xl import TransfoXLTokenizer
 from .tokenization_xlnet import XLNetTokenizer
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
+from .tokenization_distilbert import DistilBertTokenizer
 logger = logging.getLogger(__name__)
@@ -39,13 +40,14 @@ class AutoTokenizer(object):
        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
            - contains `xlnet`: XLNetTokenizer (XLNet model)
            - contains `xlm`: XLMTokenizer (XLM model)
-            - contains `roberta`: RobertaTokenizer (RoBERTa model)
         This class cannot be instantiated using `__init__()` (throws an error).
    """
@@ -60,32 +62,45 @@ class AutoTokenizer(object):
        The tokenizer class to instantiate is selected as the first pattern matching
        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
            - contains `bert`: BertTokenizer (Bert model)
            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
            - contains `xlnet`: XLNetTokenizer (XLNet model)
            - contains `xlm`: XLMTokenizer (XLM model)
-            - contains `roberta`: RobertaTokenizer (XLM model)
        Params:
-            **pretrained_model_name_or_path**: either:
+            pretrained_model_name_or_path: either:
-                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing a configuration file saved
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
-                    using the `save_pretrained(save_directory)` method.
+                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
-                - a path or url to a saved configuration `file`.
-            **cache_dir**: (`optional`) string:
+            cache_dir: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
+                Path to a directory in which downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
-                configuration should be cached if the standard cache should not be used.
+            force_download: (`optional`) boolean, default False:
+                Force a (re-)download of the vocabulary files, overriding the cached versions if they exist.
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+            inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
        Examples::
-            config = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
-            config = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/bert_saved_model/')`
        """
-        if 'roberta' in pretrained_model_name_or_path:
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif 'bert' in pretrained_model_name_or_path:
            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)