"tests/models/phi3/test_modeling_phi3.py" did not exist on "63864e057fd4ecbf54c77599702873f7be871e65"
Commit 6e1ac34e authored by erenup

Merge remote-tracking branch 'huggingface/master'

parents 2a2832ce caf1d116
@@ -157,8 +157,8 @@ def train(args, train_dataset, model, tokenizer):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
-                scheduler.step() # Update learning rate schedule
                 optimizer.step()
+                scheduler.step() # Update learning rate schedule
                 model.zero_grad()
                 global_step += 1
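The swap above is not cosmetic: since PyTorch 1.1, `scheduler.step()` is expected to run after `optimizer.step()`, otherwise the first value of the learning-rate schedule is skipped (and recent PyTorch versions emit a UserWarning). A minimal hedged sketch of the corrected ordering, with an illustrative model and schedule:

import torch

model = torch.nn.Linear(10, 2)                                   # illustrative model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: 0.95 ** step)

for step in range(4):
    loss = model(torch.randn(8, 10)).sum()
    loss.backward()
    optimizer.step()       # update the parameters first...
    scheduler.step()       # ...then advance the learning-rate schedule
    optimizer.zero_grad()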
@@ -272,7 +272,7 @@ def evaluate(args, model, tokenizer, prefix=""):

 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
-    if args.local_rank not in [-1, 0]:
+    if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     # Load data features from cache or dataset file
@@ -299,7 +299,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         logger.info("Saving features into cached file %s", cached_features_file)
         torch.save(features, cached_features_file)

-    if args.local_rank == 0:
+    if args.local_rank == 0 and not evaluate:
         torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache

     # Convert to Tensors and build dataset
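Both barrier edits implement the same pattern: in distributed training, every process except rank 0 blocks on the first barrier while rank 0 builds and saves the feature cache; rank 0 then hits the second barrier, releasing the others to load from the cache. The new `and not evaluate` guards skip this handshake during evaluation, which the script runs on a single process. A condensed hedged sketch of the pattern (the feature-building helper is hypothetical):

import torch

def load_and_cache_examples_sketch(args, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()   # non-first processes wait here

    features = build_or_load_cached_features(args)  # hypothetical helper: rank 0 builds, others read the cache

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()   # rank 0 is done writing; release the others

    return features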
@@ -481,7 +481,7 @@ def main():

     # Save the trained model and the tokenizer
-    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(args.output_dir)
@@ -498,7 +498,7 @@ def main():

         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
......
@@ -81,7 +81,7 @@ class ExamplesTests(unittest.TestCase):
             "--do_train",
             "--do_eval",
             "--version_2_with_negative",
-            "--learning_rate=1e-4",
+            "--learning_rate=2e-4",
             "--per_gpu_train_batch_size=2",
             "--per_gpu_eval_batch_size=1",
             "--overwrite_output_dir",
......
@@ -7,6 +7,7 @@ from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
+from .tokenization_distilbert import DistilBertTokenizer

 from .tokenization_utils import (PreTrainedTokenizer)

@@ -41,6 +42,9 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
 from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
                                RobertaForMultipleChoice,
                                ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_distilbert import (DistilBertConfig, DistilBertForMaskedLM, DistilBertModel,
+                                  DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
+                                  DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                              PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
......
@@ -35,7 +35,7 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p
     if gpt2_config_file == "":
         config = GPT2Config()
     else:
-        config = GPT2Config(gpt2_config_file)
+        config = GPT2Config.from_json_file(gpt2_config_file)
     model = GPT2Model(config)

     # Load weights from numpy
......
@@ -35,7 +35,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
     if openai_config_file == "":
         config = OpenAIGPTConfig()
     else:
-        config = OpenAIGPTConfig(openai_config_file)
+        config = OpenAIGPTConfig.from_json_file(openai_config_file)
     model = OpenAIGPTModel(config)

     # Load weights from numpy
......
@@ -75,7 +75,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
     if transfo_xl_config_file == "":
         config = TransfoXLConfig()
     else:
-        config = TransfoXLConfig(transfo_xl_config_file)
+        config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
     print("Building PyTorch model from configuration: {}".format(str(config)))
     model = TransfoXLLMHeadModel(config)
......
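All three conversion scripts receive the same fix: the config path is now loaded through the explicit `from_json_file` classmethod rather than being passed to the config constructor's overloaded first argument. A hedged sketch of the resulting call pattern (the JSON path is a placeholder):

from pytorch_transformers import GPT2Config, GPT2Model

config = GPT2Config.from_json_file("/path/to/gpt2_config.json")  # placeholder path
model = GPT2Model(config)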
@@ -17,8 +17,9 @@ from hashlib import sha256
 from io import open

 import boto3
-import requests
+from botocore.config import Config
 from botocore.exceptions import ClientError
+import requests
 from tqdm import tqdm

 try:
@@ -93,12 +94,15 @@ def filename_to_url(filename, cache_dir=None):
     return url, etag


-def cached_path(url_or_filename, cache_dir=None):
+def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
     """
     Given something that might be a URL (or might be a local path),
     determine which. If it's a URL, download the file and cache it, and
     return the path to the cached file. If it's already a local path,
     make sure the file exists and then return the path.
+    Args:
+        cache_dir: specify a cache directory to save the file to (overrides the default cache dir).
+        force_download: if True, re-download the file even if it's already cached in the cache dir.
     """
     if cache_dir is None:
         cache_dir = PYTORCH_TRANSFORMERS_CACHE
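A hedged usage sketch of the extended signature; the URL is a placeholder and `proxies` uses the same dictionary format as `requests`:

from pytorch_transformers.file_utils import cached_path

# Re-download even if a cached copy exists, routing traffic through a proxy.
path = cached_path("https://example.com/some-model.bin",          # placeholder URL
                   force_download=True,
                   proxies={"https": "http://10.0.0.1:3128"})     # placeholder proxy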
@@ -111,7 +115,7 @@ def cached_path(url_or_filename, cache_dir=None):
     if parsed.scheme in ('http', 'https', 's3'):
         # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, cache_dir)
+        return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
     elif os.path.exists(url_or_filename):
         # File, and it exists.
         return url_or_filename
@@ -156,24 +160,24 @@ def s3_request(func):

 @s3_request
-def s3_etag(url):
+def s3_etag(url, proxies=None):
     """Check ETag on S3 object."""
-    s3_resource = boto3.resource("s3")
+    s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
     bucket_name, s3_path = split_s3_path(url)
     s3_object = s3_resource.Object(bucket_name, s3_path)
     return s3_object.e_tag


 @s3_request
-def s3_get(url, temp_file):
+def s3_get(url, temp_file, proxies=None):
     """Pull a file directly from S3."""
-    s3_resource = boto3.resource("s3")
+    s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
     bucket_name, s3_path = split_s3_path(url)
     s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


-def http_get(url, temp_file):
-    req = requests.get(url, stream=True)
+def http_get(url, temp_file, proxies=None):
+    req = requests.get(url, stream=True, proxies=proxies)
     content_length = req.headers.get('Content-Length')
     total = int(content_length) if content_length is not None else None
     progress = tqdm(unit="B", total=total)
@@ -184,7 +188,7 @@ def http_get(url, temp_file):
     progress.close()


-def get_from_cache(url, cache_dir=None):
+def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
@@ -201,10 +205,10 @@ def get_from_cache(url, cache_dir=None):

     # Get eTag to add to filename, if it exists.
     if url.startswith("s3://"):
-        etag = s3_etag(url)
+        etag = s3_etag(url, proxies=proxies)
     else:
         try:
-            response = requests.head(url, allow_redirects=True)
+            response = requests.head(url, allow_redirects=True, proxies=proxies)
             if response.status_code != 200:
                 etag = None
             else:
@@ -227,17 +231,17 @@ def get_from_cache(url, cache_dir=None):
     if matching_files:
         cache_path = os.path.join(cache_dir, matching_files[-1])

-    if not os.path.exists(cache_path):
+    if not os.path.exists(cache_path) or force_download:
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
         with tempfile.NamedTemporaryFile() as temp_file:
-            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
+            logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)

             # GET file object
             if url.startswith("s3://"):
-                s3_get(url, temp_file)
+                s3_get(url, temp_file, proxies=proxies)
             else:
-                http_get(url, temp_file)
+                http_get(url, temp_file, proxies=proxies)

             # we are copying the file before closing it, so flush to avoid truncation
             temp_file.flush()
......
@@ -30,6 +30,7 @@ from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
 from .modeling_xlnet import XLNetConfig, XLNetModel
 from .modeling_xlm import XLMConfig, XLMModel
 from .modeling_roberta import RobertaConfig, RobertaModel
+from .modeling_distilbert import DistilBertConfig, DistilBertModel

 from .modeling_utils import PreTrainedModel, SequenceSummary

@@ -110,7 +111,9 @@ class AutoConfig(object):
             assert unused_kwargs == {'foo': False}
         """
-        if 'roberta' in pretrained_model_name_or_path:
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

@@ -225,7 +228,9 @@ class AutoModel(object):
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
         """
-        if 'roberta' in pretrained_model_name_or_path:
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
             return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
......
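Note that the order of the string checks matters: 'distilbert' must be tested before 'bert', since every DistilBERT shortcut name also contains the substring 'bert'. With these branches in place the Auto classes resolve DistilBERT checkpoints directly; a hedged sketch using the shortcut names registered in the archive maps:

from pytorch_transformers.modeling_auto import AutoConfig, AutoModel

config = AutoConfig.from_pretrained('distilbert-base-uncased')   # resolves to DistilBertConfig
model = AutoModel.from_pretrained('distilbert-base-uncased')     # resolves to DistilBertModel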
@@ -216,7 +216,7 @@ class BertConfig(PretrainedConfig):
             self.layer_norm_eps = layer_norm_eps
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
-                             "or the path to a pretrained model config file (str)")
+                             " or the path to a pretrained model config file (str)")

@@ -449,7 +449,7 @@ class BertEncoder(nn.Module):
             outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
             outputs = outputs + (all_attentions,)
-        return outputs # outputs, (hidden states), (attentions)
+        return outputs # last-layer hidden state, (all hidden states), (all attentions)


 class BertPooler(nn.Module):

@@ -577,7 +577,9 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module

     Parameters:
         config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """

 BERT_INPUTS_DOCSTRING = r"""

@@ -597,7 +599,10 @@ BERT_INPUTS_DOCSTRING = r"""
             ``tokens:         [CLS] the dog is hairy . [SEP]``
             ``token_type_ids:   0     0   0   0   0   0   0``

+            Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
             Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
......
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch DistilBERT model
adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert)
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import copy
import sys
from io import open
import itertools
import numpy as np
import torch
import torch.nn as nn
from pytorch_transformers.modeling_utils import PretrainedConfig, PreTrainedModel, add_start_docstrings, prune_linear_layer

logger = logging.getLogger(__name__)
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
}
DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
}
class DistilBertConfig(PretrainedConfig):
pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size_or_config_json_file=30522,
max_position_embeddings=512,
sinusoidal_pos_embds=True,
n_layers=6,
n_heads=12,
dim=768,
hidden_dim=4*768,
dropout=0.1,
attention_dropout=0.1,
activation='gelu',
initializer_range=0.02,
tie_weights_=True,
qa_dropout=0.1,
seq_classif_dropout=0.2,
**kwargs):
super(DistilBertConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
json_config = json.loads(reader.read())
for key, value in json_config.items():
self.__dict__[key] = value
elif isinstance(vocab_size_or_config_json_file, int):
self.vocab_size = vocab_size_or_config_json_file
self.max_position_embeddings = max_position_embeddings
self.sinusoidal_pos_embds = sinusoidal_pos_embds
self.n_layers = n_layers
self.n_heads = n_heads
self.dim = dim
self.hidden_dim = hidden_dim
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation = activation
self.initializer_range = initializer_range
self.tie_weights_ = tie_weights_
self.qa_dropout = qa_dropout
self.seq_classif_dropout = seq_classif_dropout
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property
def hidden_size(self):
return self.dim
@property
def num_attention_heads(self):
return self.n_heads
@property
def num_hidden_layers(self):
return self.n_layers
### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
def gelu(x):
return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
def create_sinusoidal_embeddings(n_pos, dim, out):
position_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
for pos in range(n_pos)
])
out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
out.detach_()
out.requires_grad = False
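# Hedged self-check (illustrative, not part of the original module): using a
# plain tensor as `out` avoids in-place edits on a Parameter that requires grad.
# Even columns hold sines, odd columns cosines; at position 0 every angle is 0,
# so the sine columns are 0 and the cosine columns are 1.
_table = torch.empty(4, 6)
create_sinusoidal_embeddings(n_pos=4, dim=6, out=_table)
assert torch.allclose(_table[0, 0::2], torch.zeros(3))
assert torch.allclose(_table[0, 1::2], torch.ones(3))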
class Embeddings(nn.Module):
def __init__(self,
config):
super(Embeddings, self).__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
if config.sinusoidal_pos_embds:
create_sinusoidal_embeddings(n_pos=config.max_position_embeddings,
dim=config.dim,
out=self.position_embeddings.weight)
self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12)
self.dropout = nn.Dropout(config.dropout)
def forward(self, input_ids):
"""
Parameters
----------
input_ids: torch.tensor(bs, max_seq_length)
The token ids to embed.
Outputs
-------
embeddings: torch.tensor(bs, max_seq_length, dim)
The embedded tokens (plus position embeddings, no token_type embeddings)
"""
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length)
word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim)
position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim)
embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim)
embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim)
embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim)
return embeddings
class MultiHeadSelfAttention(nn.Module):
def __init__(self, config):
super(MultiHeadSelfAttention, self).__init__()
self.n_heads = config.n_heads
self.dim = config.dim
self.dropout = nn.Dropout(p=config.attention_dropout)
self.output_attentions = config.output_attentions
assert self.dim % self.n_heads == 0
self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
def prune_heads(self, heads):
attention_head_size = self.dim // self.n_heads
if len(heads) == 0:
return
mask = torch.ones(self.n_heads, attention_head_size)
for head in heads:
mask[head] = 0
mask = mask.view(-1).contiguous().eq(1)
index = torch.arange(len(mask))[mask].long()
# Prune linear layers
self.q_lin = prune_linear_layer(self.q_lin, index)
self.k_lin = prune_linear_layer(self.k_lin, index)
self.v_lin = prune_linear_layer(self.v_lin, index)
self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
# Update hyper params
self.n_heads = self.n_heads - len(heads)
self.dim = attention_head_size * self.n_heads
    def forward(self, query, key, value, mask, head_mask=None):
        """
        Parameters
        ----------
        query: torch.tensor(bs, seq_length, dim)
        key: torch.tensor(bs, seq_length, dim)
        value: torch.tensor(bs, seq_length, dim)
        mask: torch.tensor(bs, seq_length)
        Outputs
        -------
        weights: torch.tensor(bs, n_heads, seq_length, seq_length)
            Attention weights. Optional: only returned if `output_attentions=True`
        context: torch.tensor(bs, seq_length, dim)
            Contextualized layer
        """
bs, q_length, dim = query.size()
k_length = key.size(1)
# assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
# assert key.size() == value.size()
dim_per_head = self.dim // self.n_heads
assert 2 <= mask.dim() <= 3
causal = (mask.dim() == 3)
mask_reshp = (bs, 1, 1, k_length)
def shape(x):
""" separate heads """
return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
def unshape(x):
""" group heads """
return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head)
k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head)
v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head)
q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head)
scores = torch.matmul(q, k.transpose(2,3)) # (bs, n_heads, q_length, k_length)
mask = (mask==0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length)
scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length)
weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length)
weights = self.dropout(weights) # (bs, n_heads, q_length, k_length)
# Mask heads if we want to
if head_mask is not None:
weights = weights * head_mask
context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head)
context = unshape(context) # (bs, q_length, dim)
context = self.out_lin(context) # (bs, q_length, dim)
if self.output_attentions:
return (context, weights)
else:
return (context,)
class FFN(nn.Module):
def __init__(self, config):
super(FFN, self).__init__()
self.dropout = nn.Dropout(p=config.dropout)
self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation)
self.activation = gelu if config.activation == 'gelu' else nn.ReLU()
def forward(self, input):
x = self.lin1(input)
x = self.activation(x)
x = self.lin2(x)
x = self.dropout(x)
return x
class TransformerBlock(nn.Module):
def __init__(self, config):
super(TransformerBlock, self).__init__()
self.n_heads = config.n_heads
self.dim = config.dim
self.hidden_dim = config.hidden_dim
self.dropout = nn.Dropout(p=config.dropout)
self.activation = config.activation
self.output_attentions = config.output_attentions
assert config.dim % config.n_heads == 0
self.attention = MultiHeadSelfAttention(config)
self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
self.ffn = FFN(config)
self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
def forward(self, x, attn_mask=None, head_mask=None):
"""
Parameters
----------
x: torch.tensor(bs, seq_length, dim)
attn_mask: torch.tensor(bs, seq_length)
Outputs
-------
sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length)
The attention weights
ffn_output: torch.tensor(bs, seq_length, dim)
The output of the transformer block contextualization.
"""
# Self-Attention
sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask)
if self.output_attentions:
sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length)
        else:  # self.attention returns a 1-tuple when attentions are not output
            assert type(sa_output) == tuple
            sa_output = sa_output[0]
sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim)
# Feed Forward Network
ffn_output = self.ffn(sa_output) # (bs, seq_length, dim)
ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim)
output = (ffn_output,)
if self.output_attentions:
output = (sa_weights,) + output
return output
class Transformer(nn.Module):
def __init__(self, config):
super(Transformer, self).__init__()
self.n_layers = config.n_layers
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
layer = TransformerBlock(config)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)])
def forward(self, x, attn_mask=None, head_mask=None):
"""
Parameters
----------
x: torch.tensor(bs, seq_length, dim)
Input sequence embedded.
attn_mask: torch.tensor(bs, seq_length)
Attention mask on the sequence.
Outputs
-------
hidden_state: torch.tensor(bs, seq_length, dim)
Sequence of hiddens states in the last (top) layer
all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
Tuple of length n_layers with the hidden states from each layer.
Optional: only if output_hidden_states=True
all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
Tuple of length n_layers with the attention weights from each layer
Optional: only if output_attentions=True
"""
all_hidden_states = ()
all_attentions = ()
hidden_state = x
for i, layer_module in enumerate(self.layer):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
layer_outputs = layer_module(x=hidden_state,
attn_mask=attn_mask,
head_mask=head_mask[i])
hidden_state = layer_outputs[-1]
if self.output_attentions:
assert len(layer_outputs) == 2
attentions = layer_outputs[0]
all_attentions = all_attentions + (attentions,)
else:
assert len(layer_outputs) == 1
# Add last layer
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_state,)
outputs = (hidden_state,)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
outputs = outputs + (all_attentions,)
return outputs # last-layer hidden state, (all hidden states), (all attentions)
### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ###
class DistilBertPreTrainedModel(PreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = DistilBertConfig
pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
load_tf_weights = None
base_model_prefix = "distilbert"
def __init__(self, *inputs, **kwargs):
super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs)
def init_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, nn.Embedding):
if module.weight.requires_grad:
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()
DISTILBERT_START_DOCSTRING = r"""
    DistilBERT is a small, fast, cheap and light Transformer model
    trained by distilling Bert base. It has 40% fewer parameters than
    `bert-base-uncased` and runs 60% faster, while preserving over 95% of
    Bert's performance as measured on the GLUE language understanding benchmark.

    Here are the differences between the interface of Bert and DistilBert:
    - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
    - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let us know if you need this option.
For more information on DistilBERT, please refer to our
`detailed blog post`_
.. _`detailed blog post`:
https://medium.com/huggingface/distilbert-8cf3380435b5
Parameters:
config (:class:`~pytorch_transformers.DistilBertConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
DISTILBERT_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids** ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
The input sequences should start with `[CLS]` and end with `[SEP]` tokens.
For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT.
**attention_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertModel(DistilBertPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config):
super(DistilBertModel, self).__init__(config)
self.embeddings = Embeddings(config) # Embeddings
self.transformer = Transformer(config) # Encoder
self.apply(self.init_weights)
def _resize_token_embeddings(self, new_num_tokens):
old_embeddings = self.embeddings.word_embeddings
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
self.embeddings.word_embeddings = new_embeddings
return self.embeddings.word_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.transformer.layer[layer].attention.prune_heads(heads)
def forward(self,
input_ids, attention_mask=None, head_mask=None):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids) # (bs, seq_length)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
elif head_mask.dim() == 2:
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed (fp16 compatibility)
else:
head_mask = [None] * self.config.num_hidden_layers
embedding_output = self.embeddings(input_ids) # (bs, seq_length, dim)
tfmr_output = self.transformer(x=embedding_output,
attn_mask=attention_mask,
head_mask=head_mask)
hidden_state = tfmr_output[0]
output = (hidden_state, ) + tfmr_output[1:]
return output # last-layer hidden-state, (all hidden_states), (all attentions)
@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertForMaskedLM(DistilBertPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the masked language modeling loss.
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss.
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids)
loss, prediction_scores = outputs[:2]
"""
def __init__(self, config):
super(DistilBertForMaskedLM, self).__init__(config)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.distilbert = DistilBertModel(config)
self.vocab_transform = nn.Linear(config.dim, config.dim)
self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
self.vocab_projector = nn.Linear(config.dim, config.vocab_size)
self.apply(self.init_weights)
self.tie_weights()
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
def tie_weights(self):
""" Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
"""
self._tie_or_clone_weights(self.vocab_projector,
self.distilbert.embeddings.word_embeddings)
def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None):
dlbrt_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask)
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim)
prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size)
outputs = (prediction_logits, ) + dlbrt_output[1:]
if masked_lm_labels is not None:
mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)),
masked_lm_labels.view(-1))
outputs = (mlm_loss,) + outputs
return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)
@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
r"""
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the sequence classification/regression loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
**logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]
"""
def __init__(self, config):
super(DistilBertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
self.distilbert = DistilBertModel(config)
self.pre_classifier = nn.Linear(config.dim, config.dim)
self.classifier = nn.Linear(config.dim, config.num_labels)
self.dropout = nn.Dropout(config.seq_classif_dropout)
self.apply(self.init_weights)
def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
distilbert_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask)
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
pooled_output = hidden_state[:, 0] # (bs, dim)
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
pooled_output = nn.ReLU()(pooled_output) # (bs, dim)
pooled_output = self.dropout(pooled_output) # (bs, dim)
        logits = self.classifier(pooled_output)             # (bs, num_labels)
outputs = (logits,) + distilbert_output[1:]
if labels is not None:
if self.num_labels == 1:
loss_fct = nn.MSELoss()
loss = loss_fct(logits.view(-1), labels.view(-1))
else:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Position outside of the sequence are not taken into account for computing the loss.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
        loss, start_scores, end_scores = outputs[:3]
"""
def __init__(self, config):
super(DistilBertForQuestionAnswering, self).__init__(config)
self.distilbert = DistilBertModel(config)
self.qa_outputs = nn.Linear(config.dim, config.num_labels)
assert config.num_labels == 2
self.dropout = nn.Dropout(config.qa_dropout)
self.apply(self.init_weights)
def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
distilbert_output = self.distilbert(input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask)
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1) # (bs, max_query_len)
end_logits = end_logits.squeeze(-1) # (bs, max_query_len)
outputs = (start_logits, end_logits,) + distilbert_output[1:]
if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds an extra dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
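Putting the interface notes above together (no `token_type_ids`, an optional `attention_mask`, and a `head_mask` that may be given per layer), usage looks roughly as follows; a hedged sketch that mirrors the docstring examples, with segments joined via the separator token:

import torch
from pytorch_transformers import DistilBertTokenizer, DistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# No token_type_ids: segments are simply joined with the separator token.
text = "Who wrote BERT? " + tokenizer.sep_token + " It came out of Google."
input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # (1, seq_length)
attention_mask = torch.ones_like(input_ids)                    # all real tokens, no padding

# Optional (num_layers, num_heads) head mask: 1.0 keeps a head, 0.0 silences it.
head_mask = torch.ones(model.config.num_hidden_layers, model.config.num_attention_heads)
head_mask[:, 0] = 0.0                                          # drop head 0 in every layer

outputs = model(input_ids, attention_mask=attention_mask, head_mask=head_mask)
last_hidden_state = outputs[0]                                 # (1, seq_length, 768)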
@@ -38,9 +38,11 @@ from .modeling_bert import BertLayerNorm as LayerNorm

 logger = logging.getLogger(__name__)

 GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
-                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
+                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
+                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}

 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
-                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
+                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
+                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}

 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
@@ -383,11 +385,15 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in

     Parameters:
         config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """

 GPT2_INPUTS_DOCSTRING = r""" Inputs:
     **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
         Indices of input sequence tokens in the vocabulary.
+        GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
+        the right rather than the left.
         Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
         See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
         :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
@@ -402,10 +408,6 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
         list of ``torch.FloatTensor`` (one for each layer):
         that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
         (see `past` output below). Can be used to speed up sequential decoding.
-    **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
-        Mask to avoid performing attention on padding token indices.
-        Mask values selected in ``[0, 1]``:
-        ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
     **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
         Mask to nullify selected heads of the self-attention modules.
         Mask values selected in ``[0, 1]``:
@@ -612,7 +614,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):

 @add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
-the classification head takes as input the input of a specified classification token index in the intput sequence).
+the classification head takes as input the input of a specified classification token index in the input sequence).
 """, GPT2_START_DOCSTRING)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     r""" Inputs:
...@@ -636,10 +638,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): ...@@ -636,10 +638,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
list of ``torch.FloatTensor`` (one for each layer): list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding. (see `past` output below). Can be used to speed up sequential decoding.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
...@@ -650,14 +648,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-1`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
`multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
...
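A short usage sketch for the double-heads model with the renamed `mc_labels` argument; the `[CLS]` marker, checkpoint name and sentences are illustrative assumptions:

import torch
from pytorch_transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

# Assumption: register a classification token so each choice can end with a
# token whose hidden state feeds the multiple-choice head, then resize embeddings.
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer))

choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded = [tokenizer.encode(s) for s in choices]
input_ids = torch.tensor(encoded).unsqueeze(0)                # (1, num_choices, seq_len)
mc_token_ids = torch.tensor([[len(e) - 1 for e in encoded]])  # index of [CLS] in each choice
lm_logits, mc_logits = model(input_ids, mc_token_ids=mc_token_ids)[:2]
# Passing mc_labels=torch.tensor([0]) of shape (batch_size,) would additionally
# return the multiple choice classification loss, per the Outputs section above.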
...@@ -397,11 +397,15 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in
Parameters:
config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
""" """
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs: OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
...@@ -602,7 +606,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the input of a specified classification token index in the input sequence.
""", OPENAI_GPT_START_DOCSTRING)
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
r""" Inputs:
...
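Because GPT uses absolute position embeddings, the note above recommends right-padding; a sketch of what that means in practice (the pad id of 0 and attention_mask support are assumptions here, mirroring the GPT-2 docstring above):

import torch
from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel.from_pretrained('openai-gpt')
model.eval()

ids = tokenizer.encode("hello world")
pad_len = 3
# Right-padding keeps each real token at its true absolute position 0..len-1;
# left-padding would shift every token and corrupt the position embeddings.
input_ids = torch.tensor([ids + [0] * pad_len])
attention_mask = torch.tensor([[1] * len(ids) + [0] * pad_len])
with torch.no_grad():
    hidden_states = model(input_ids, attention_mask=attention_mask)[0]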
...@@ -90,25 +90,30 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
Parameters:
config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
""" """
ROBERTA_INPUTS_DOCSTRING = r""" ROBERTA_INPUTS_DOCSTRING = r"""
Inputs: Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
To match pre-training, RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows: To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
(a) For sequence pairs: (a) For sequence pairs:
``tokens: [CLS] is this jack ##son ##ville ? [SEP][SEP] no it is not . [SEP]`` ``tokens: <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
(b) For single sequences: (b) For single sequences:
``tokens: [CLS] the dog is hairy . [SEP]`` ``tokens: <s> the dog is hairy . </s>``
Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with
the ``add_special_tokens`` parameter set to ``True``. the ``add_special_tokens`` parameter set to ``True``.
RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
...
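A quick check of the <s>/</s> formatting via the tokenizer (the 'roberta-base' checkpoint name is an assumption; the ``add_special_tokens`` flag is the one referenced above):

from pytorch_transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Single sequence -> <s> ... </s>
ids = tokenizer.encode("the dog is hairy", add_special_tokens=True)
print(tokenizer.convert_ids_to_tokens(ids))
# Sequence pair -> <s> A </s> </s> B </s>
pair_ids = tokenizer.encode("Is this Jacksonville ?", "No it is not .", add_special_tokens=True)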
...@@ -285,7 +285,7 @@ class TransfoXLConfig(PretrainedConfig):
self.init_std = init_std
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property
def max_position_embeddings(self):
...@@ -928,12 +928,16 @@ TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
Parameters:
config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
""" """
TRANSFO_XL_INPUTS_DOCSTRING = r""" TRANSFO_XL_INPUTS_DOCSTRING = r"""
Inputs: Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
...@@ -1138,10 +1142,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
else:
mask_shift_len = qlen
dec_attn_mask = (torch.triu(all_ones, 1+mlen)
+ torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None]  # -1
else:
dec_attn_mask = torch.triu(
word_emb.new_ones(qlen, klen), diagonal=1+mlen).bool()[:,:,None]
hids = []
attentions = []
...
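The byte-to-bool change above matters because newer PyTorch versions deprecate uint8 masks; a standalone sketch of the same causal-mask construction (the sizes are illustrative):

import torch

qlen, mlen = 4, 2  # illustrative current-segment and memory lengths
klen = qlen + mlen
# Entries strictly above the diagonal offset by 1 + mlen mark future positions;
# .bool() produces the mask dtype expected by recent PyTorch releases.
dec_attn_mask = torch.triu(torch.ones(qlen, klen), diagonal=1 + mlen).bool()
print(dec_attn_mask)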
...@@ -71,6 +71,10 @@ class PretrainedConfig(object):
r""" Base class for all configuration classes.
Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
Note:
A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
It only affects the model's configuration.
Class attributes (overridden by derived classes):
- ``pretrained_config_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
...@@ -121,6 +125,13 @@ class PretrainedConfig(object):
- The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
- Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
return_unused_kwargs: (`optional`) bool:
- If False, then this function returns just the final configuration object.
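A sketch of the two new keyword arguments in use (BertConfig stands in for any config class; the proxy endpoints mirror the docstring's own example dict and are not real servers):

from pytorch_transformers import BertConfig

proxies = {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}
config = BertConfig.from_pretrained('bert-base-uncased',
                                    force_download=True,  # ignore any cached copy
                                    proxies=proxies)      # route every request through the proxies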
...@@ -142,6 +153,8 @@ class PretrainedConfig(object):
"""
cache_dir = kwargs.pop('cache_dir', None)
force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
...@@ -152,8 +165,8 @@ class PretrainedConfig(object):
config_file = pretrained_model_name_or_path
# redirect to the cache, if necessary
try:
resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained model configuration file.".format(
...@@ -166,7 +179,7 @@ class PretrainedConfig(object):
pretrained_model_name_or_path,
', '.join(cls.pretrained_config_archive_map.keys()),
config_file))
raise e
if resolved_config_file == config_file:
logger.info("loading configuration file {}".format(config_file))
else:
...@@ -396,6 +409,13 @@ class PreTrainedModel(nn.Module):
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
output_loading_info: (`optional`) boolean:
Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
...@@ -420,6 +440,8 @@ class PreTrainedModel(nn.Module):
state_dict = kwargs.pop('state_dict', None)
cache_dir = kwargs.pop('cache_dir', None)
from_tf = kwargs.pop('from_tf', False)
force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
output_loading_info = kwargs.pop('output_loading_info', False)
# Load config
...@@ -427,6 +449,7 @@ class PreTrainedModel(nn.Module):
config, model_kwargs = cls.config_class.from_pretrained(
pretrained_model_name_or_path, *model_args,
cache_dir=cache_dir, return_unused_kwargs=True,
force_download=force_download,
**kwargs
)
else:
...@@ -449,8 +472,8 @@ class PreTrainedModel(nn.Module):
archive_file = pretrained_model_name_or_path
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
...@@ -463,7 +486,7 @@ class PreTrainedModel(nn.Module):
pretrained_model_name_or_path,
', '.join(cls.pretrained_model_archive_map.keys()),
archive_file))
raise e
if resolved_archive_file == archive_file:
logger.info("loading weights file {}".format(archive_file))
else:
...
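Since a failed download now raises instead of returning None, callers can handle the error explicitly; a sketch (BertModel stands in for any model class):

from pytorch_transformers import BertModel

try:
    model = BertModel.from_pretrained('bert-base-uncased')
except EnvironmentError:
    # Previously from_pretrained returned None here, deferring the failure to a
    # confusing AttributeError later; now the caller sees the real error.
    raise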
...@@ -178,7 +178,7 @@ class XLMConfig(PretrainedConfig):
self.end_n_top = end_n_top
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property
def vocab_size(self):
...@@ -416,12 +416,18 @@ XLM_START_DOCSTRING = r""" The XLM model was proposed in
Parameters:
config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
""" """
XLM_INPUTS_DOCSTRING = r""" XLM_INPUTS_DOCSTRING = r"""
Inputs: Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
...@@ -434,8 +440,10 @@ XLM_INPUTS_DOCSTRING = r"""
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens to be used to indicate the language of each token in the input.
Indices are language ids which can be obtained from the language names by using two conversion mappings
provided in the configuration of the model (only provided for multilingual models).
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
...
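A sketch of building the `langs` tensor from the config mappings described above (the multilingual checkpoint name and sentence are illustrative assumptions):

import torch
from pytorch_transformers import XLMTokenizer, XLMModel

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024')
model = XLMModel.from_pretrained('xlm-mlm-enfr-1024')

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to train this model")])
lang_id = model.config.lang2id['en']          # language name -> language id
langs = torch.full_like(input_ids, lang_id)   # one language id per input token
hidden_states = model(input_ids, langs=langs)[0]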
...@@ -306,7 +306,7 @@ class XLNetConfig(PretrainedConfig):
self.end_n_top = end_n_top
else:
raise ValueError("First argument must be either a vocabulary size (int)"
" or the path to a pretrained model config file (str)")
@property
def max_position_embeddings(self):
...@@ -647,12 +647,16 @@ XLNET_START_DOCSTRING = r""" The XLNet model was proposed in
Parameters:
config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
""" """
XLNET_INPUTS_DOCSTRING = r""" XLNET_INPUTS_DOCSTRING = r"""
Inputs: Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
XLNet is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
...@@ -673,8 +677,11 @@ XLNET_INPUTS_DOCSTRING = r"""
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
**mems**: (`optional`)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
To activate mems you need to set config.mem_len to a positive value, which will be the maximum number of tokens in
the memory output by the model. E.g. `model = XLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)` will
instantiate a model which can use up to 1024 tokens of memory (in addition to the input itself).
**perm_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
If ``perm_mask[k, i, j] = 0``, i attend to j in batch k;
...@@ -701,7 +708,8 @@ class XLNetModel(XLNetPreTrainedModel):
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
...@@ -855,7 +863,7 @@ class XLNetModel(XLNetPreTrainedModel):
target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
qlen, bsz = input_ids.shape[0], input_ids.shape[1]
mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
klen = mlen + qlen
dtype_float = next(self.parameters()).dtype
...@@ -1007,7 +1015,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
...@@ -1087,7 +1096,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
...@@ -1229,7 +1239,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
...
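A sketch of the mems round-trip described in the docstrings above (the texts are illustrative; mem_len=1024 follows the docstring's own example):

import torch
from pytorch_transformers import XLNetTokenizer, XLNetModel

tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)  # activate the memory
model.eval()

mems = None
for text in ["The first segment.", "The second segment."]:
    input_ids = torch.tensor([tokenizer.encode(text)])
    with torch.no_grad():
        # each call consumes the previous memory and returns an updated one
        hidden_states, mems = model(input_ids, mems=mems)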
...@@ -21,6 +21,7 @@ import os
import shutil
import json
import random
import uuid
import unittest
import logging
...@@ -48,6 +49,7 @@ class CommonTestCases:
test_torchscript = True
test_pruning = True
test_resize_embeddings = True
test_head_masking = True
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
...@@ -158,6 +160,9 @@ class CommonTestCases:
def test_headmasking(self):
if not self.test_head_masking:
return
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config.output_attentions = True
...@@ -281,6 +286,9 @@ class CommonTestCases:
self.assertTrue(models_equal)
def test_tie_model_weights(self):
if not self.test_torchscript:
return
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
def check_same_values(layer_1, layer_2):
...@@ -527,7 +535,7 @@ class ConfigTester(object):
def create_and_test_config_to_json_file(self):
config_first = self.config_class(**self.inputs_dict)
json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
config_first.to_json_file(json_file_path)
config_second = self.config_class.from_json_file(json_file_path)
os.remove(json_file_path)
...
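The uuid-based filename above removes the collision that a fixed "/tmp/config.json" caused when test processes ran concurrently; a standard-library alternative (a sketch, not what the test uses) would be tempfile:

import os
import tempfile

# mkstemp creates, and uniquely names, the file atomically in the temp directory
fd, json_file_path = tempfile.mkstemp(suffix=".json")
os.close(fd)
try:
    config_first.to_json_file(json_file_path)   # config_first as in the test above
    config_second = self.config_class.from_json_file(json_file_path)
finally:
    os.remove(json_file_path)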
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import pytest
from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
class DistilBertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
DistilBertForSequenceClassification)
test_pruning = True
test_torchscript = True
test_resize_embeddings = True
test_head_masking = True
class DistilBertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=False,
use_labels=True,
vocab_size=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = DistilBertConfig(
vocab_size_or_config_json_file=self.vocab_size,
dim=self.hidden_size,
n_layers=self.num_hidden_layers,
n_heads=self.num_attention_heads,
hidden_dim=self.intermediate_size,
hidden_act=self.hidden_act,
dropout=self.hidden_dropout_prob,
attention_dropout=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
initializer_range=self.initializer_range)
return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertModel(config=config)
model.eval()
(sequence_output,) = model(input_ids, input_mask)
(sequence_output,) = model(input_ids)
result = {
"sequence_output": sequence_output,
}
self.parent.assertListEqual(
list(result["sequence_output"].size()),
[self.batch_size, self.seq_length, self.hidden_size])
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertForMaskedLM(config=config)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
result = {
"loss": loss,
"prediction_scores": prediction_scores,
}
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertForQuestionAnswering(config=config)
model.eval()
loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
result = {
"loss": loss,
"start_logits": start_logits,
"end_logits": end_logits,
}
self.parent.assertListEqual(
list(result["start_logits"].size()),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].size()),
[self.batch_size, self.seq_length])
self.check_loss_output(result)
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = DistilBertForSequenceClassification(config)
model.eval()
loss, logits = model(input_ids, input_mask, sequence_labels)
result = {
"loss": loss,
"logits": logits,
}
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.num_labels])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = DistilBertModelTest.DistilBertModelTester(self)
self.config_tester = ConfigTester(self, config_class=DistilBertConfig, dim=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_distilbert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_masked_lm(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
# @pytest.mark.slow
# def test_model_from_pretrained(self):
# cache_dir = "/tmp/pytorch_transformers_test/"
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
# model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
# shutil.rmtree(cache_dir)
# self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
...@@ -42,7 +42,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
def get_tokenizer(self):
return self.tokenizer_class.from_pretrained(self.tmpdirname)
def get_input_output_texts(self):
input_text = u"UNwant\u00E9d,running"
...@@ -50,7 +50,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
return input_text, output_text
def test_full_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file)
tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
...@@ -126,7 +126,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
self.assertFalse(_is_punctuation(u" "))
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
text = tokenizer.encode("sequence builders")
text_2 = tokenizer.encode("multi-sequence build")
...