Commit 40ed7172 authored by erenup

Merge remote-tracking branch 'refs/remotes/huggingface/master'

parents 86a63070 7296f101
......@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
from torch.nn import functional as F
from .configuration_utils import PretrainedConfig
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
cached_path, hf_bucket_url, is_remote_url)
logger = logging.getLogger(__name__)
......@@ -53,7 +54,7 @@ class PreTrainedModel(nn.Module):
r""" Base class for all models.
:class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
......@@ -83,55 +84,59 @@ class PreTrainedModel(nn.Module):
# Save config in model
self.config = config
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
""" Build a resized Embedding Module from a provided token Embedding Module.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
@property
def base_model(self):
return getattr(self, self.base_model_prefix, self)
Args:
new_num_tokens: (`optional`) int
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: return the provided token Embedding Module.
Return: ``torch.nn.Embeddings``
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
def get_input_embeddings(self):
""" Get model's input embeddings
"""
if new_num_tokens is None:
return old_embeddings
old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
if old_num_tokens == new_num_tokens:
return old_embeddings
# Build new embeddings
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
new_embeddings.to(old_embeddings.weight.device)
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
return base_model.get_input_embeddings()
else:
raise NotImplementedError
# initialize all new embeddings (in particular added tokens)
self._init_weights(new_embeddings)
def set_input_embeddings(self, value):
""" Set model's input embeddings
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
base_model.set_input_embeddings(value)
else:
raise NotImplementedError
# Copy word embeddings from the previous weights
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
def get_output_embeddings(self):
""" Get model's output embeddings
Return None if the model doesn't have output embeddings
"""
return None # Overwrite for models with output embeddings
return new_embeddings
def tie_weights(self):
""" Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
"""
output_embeddings = self.get_output_embeddings()
if output_embeddings is not None:
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
def _tie_or_clone_weights(self, first_module, second_module):
def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
""" Tie or clone module weights depending of weither we are using TorchScript or not
"""
if self.config.torchscript:
first_module.weight = nn.Parameter(second_module.weight.clone())
output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
else:
first_module.weight = second_module.weight
output_embeddings.weight = input_embeddings.weight
if hasattr(first_module, 'bias') and first_module.bias is not None:
first_module.bias.data = torch.nn.functional.pad(
first_module.bias.data,
(0, first_module.weight.shape[0] - first_module.bias.shape[0]),
if hasattr(output_embeddings, 'bias') and output_embeddings.bias is not None:
output_embeddings.bias.data = torch.nn.functional.pad(
output_embeddings.bias.data,
(0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]),
'constant',
0
)
if hasattr(output_embeddings, 'out_features') and hasattr(input_embeddings, 'num_embeddings'):
output_embeddings.out_features = input_embeddings.num_embeddings
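# For readers skimming the diff: a minimal standalone sketch of what tying versus cloning
# amounts to. The toy embedding and projection below are illustrative, not part of the library.
import torch.nn as nn

embedding = nn.Embedding(10, 4)            # input embeddings: vocab 10, dim 4
decoder = nn.Linear(4, 10, bias=False)     # output projection with a matching weight shape

torchscript = False                        # stands in for config.torchscript
if torchscript:
    decoder.weight = nn.Parameter(embedding.weight.clone())   # independent copy, safe to export
else:
    decoder.weight = embedding.weight                         # shared Parameter, updates stay in sync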
def resize_token_embeddings(self, new_num_tokens=None):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
......@@ -161,6 +166,46 @@ class PreTrainedModel(nn.Module):
return model_embeds
def _resize_token_embeddings(self, new_num_tokens):
old_embeddings = self.get_input_embeddings()
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
self.set_input_embeddings(new_embeddings)
return self.get_input_embeddings()
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
""" Build a resized Embedding Module from a provided token Embedding Module.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
Args:
new_num_tokens: (`optional`) int
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: return the provided token Embedding Module.
Return: ``torch.nn.Embeddings``
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
"""
if new_num_tokens is None:
return old_embeddings
old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
if old_num_tokens == new_num_tokens:
return old_embeddings
# Build new embeddings
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
new_embeddings.to(old_embeddings.weight.device)
# initialize all new embeddings (in particular added tokens)
self._init_weights(new_embeddings)
# Copy word embeddings from the previous weights
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
return new_embeddings
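# A usage sketch of the resizing path above, assuming standard pretrained shortcuts;
# the added tokens are placeholders.
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer.add_tokens(['new_tok1', 'new_tok2'])       # grow the vocabulary
model.resize_token_embeddings(len(tokenizer))        # copies old rows, randomly initializes the new ones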
def init_weights(self):
""" Initialize and prunes weights if needed. """
# Initialize weights
......@@ -170,6 +215,9 @@ class PreTrainedModel(nn.Module):
if self.config.pruned_heads:
self.prune_heads(self.config.pruned_heads)
# Tie weights if needed
self.tie_weights()
def prune_heads(self, heads_to_prune):
""" Prunes heads of the base model.
......@@ -178,14 +226,12 @@ class PreTrainedModel(nn.Module):
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
"""
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
for layer, heads in heads_to_prune.items():
union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
base_model._prune_heads(heads_to_prune)
self.base_model._prune_heads(heads_to_prune)
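# A short usage sketch of head pruning, mirroring the dict format described in the docstring above.
from transformers import BertModel

model = BertModel.from_pretrained('bert-base-uncased')
model.prune_heads({1: [0, 2], 2: [2, 3]})   # prune heads 0 and 2 in layer 1, heads 2 and 3 in layer 2
# The union with previously pruned heads is recorded in model.config.pruned_heads.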
def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it
......@@ -193,7 +239,7 @@ class PreTrainedModel(nn.Module):
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
# Only save the model it-self if we are using distributed training
# Only save the model itself if we are using distributed training
model_to_save = self.module if hasattr(self, 'module') else self
# Save configuration file
......@@ -220,6 +266,7 @@ class PreTrainedModel(nn.Module):
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
......@@ -246,6 +293,9 @@ class PreTrainedModel(nn.Module):
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download: (`optional`) boolean, default False:
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
......@@ -270,11 +320,17 @@ class PreTrainedModel(nn.Module):
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
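# An additional sketch of the new download options; the proxy address below is a placeholder.
model = BertModel.from_pretrained('bert-base-uncased', resume_download=True,
                                  proxies={'https': 'proxy.example.com:3128'})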
"""
if pretrained_model_name_or_path is not None and (
"albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path):
logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
"https://github.com/google-research/google-research/issues/119 for more information.")
config = kwargs.pop('config', None)
state_dict = kwargs.pop('state_dict', None)
cache_dir = kwargs.pop('cache_dir', None)
from_tf = kwargs.pop('from_tf', False)
force_download = kwargs.pop('force_download', False)
resume_download = kwargs.pop('resume_download', False)
proxies = kwargs.pop('proxies', None)
output_loading_info = kwargs.pop('output_loading_info', False)
......@@ -284,6 +340,8 @@ class PreTrainedModel(nn.Module):
pretrained_model_name_or_path, *model_args,
cache_dir=cache_dir, return_unused_kwargs=True,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
**kwargs
)
else:
......@@ -307,15 +365,21 @@ class PreTrainedModel(nn.Module):
raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
[WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
pretrained_model_name_or_path))
elif os.path.isfile(pretrained_model_name_or_path):
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
archive_file = pretrained_model_name_or_path
else:
assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
pretrained_model_name_or_path + ".index")
archive_file = pretrained_model_name_or_path + ".index"
else:
archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
if from_tf:
raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
proxies=proxies, resume_download=resume_download)
except EnvironmentError:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
......@@ -371,6 +435,8 @@ class PreTrainedModel(nn.Module):
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if key == 'lm_head.decoder.weight':
new_key = 'lm_head.weight'
if new_key:
old_keys.append(key)
new_keys.append(new_key)
......@@ -383,6 +449,8 @@ class PreTrainedModel(nn.Module):
if metadata is not None:
state_dict._metadata = metadata
# PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
# so we need to apply the function recursively.
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
......@@ -680,7 +748,7 @@ class SequenceSummary(nn.Module):
def __init__(self, config):
super(SequenceSummary, self).__init__()
self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
if self.summary_type == 'attn':
# We should use a standard multi-head attention module with absolute positional embedding for that.
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
......
......@@ -73,15 +73,15 @@ def get_masks(slen, lengths, causal, padding_mask=None):
"""
Generate hidden states mask, and optionally an attention mask.
"""
bs = lengths.size(0)
alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
if padding_mask is not None:
mask = padding_mask
else:
assert lengths.max().item() <= slen
alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
mask = alen < lengths[:, None]
# attention mask is the same as mask, or triangular inferior attention (causal)
bs = lengths.size(0)
if causal:
attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
else:
......@@ -311,6 +311,10 @@ XLM_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
......@@ -407,10 +411,12 @@ class XLMModel(XLMPreTrainedModel):
self.init_weights()
def _resize_token_embeddings(self, new_num_tokens):
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
......@@ -419,14 +425,21 @@ class XLMModel(XLMPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None): # removed: src_enc=None, src_len=None
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None): # removed: src_enc=None, src_len=None
if input_ids is not None:
bs, slen = input_ids.size()
else:
bs, slen = inputs_embeds.size()[:-1]
if lengths is None:
lengths = (input_ids != self.pad_index).sum(dim=1).long()
if input_ids is not None:
lengths = (input_ids != self.pad_index).sum(dim=1).long()
else:
lengths = torch.LongTensor([slen]*bs)
# mask = input_ids != self.pad_index
# check inputs
bs, slen = input_ids.size()
assert lengths.size(0) == bs
assert lengths.max().item() <= slen
# input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
......@@ -440,10 +453,12 @@ class XLMModel(XLMPreTrainedModel):
# if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
device = input_ids.device if input_ids is not None else inputs_embeds.device
# position_ids
if position_ids is None:
position_ids = input_ids.new((slen,)).long()
position_ids = torch.arange(slen, out=position_ids).unsqueeze(0)
position_ids = torch.arange(slen, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0).expand((bs, slen))
else:
assert position_ids.size() == (bs, slen) # (slen, bs)
# position_ids = position_ids.transpose(0, 1)
......@@ -469,7 +484,7 @@ class XLMModel(XLMPreTrainedModel):
head_mask = [None] * self.n_layers
# do not recompute cached elements
if cache is not None:
if cache is not None and input_ids is not None:
_slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:]
position_ids = position_ids[:, -_slen:]
......@@ -479,8 +494,10 @@ class XLMModel(XLMPreTrainedModel):
attn_mask = attn_mask[:, -_slen:]
# embeddings
tensor = self.embeddings(input_ids)
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
if inputs_embeds is None:
inputs_embeds = self.embeddings(input_ids)
tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None:
......@@ -618,15 +635,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
self.pred_layer = XLMPredLayer(config)
self.init_weights()
self.tie_weights()
def tie_weights(self):
""" Make sure we are sharing the embeddings
"""
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
def get_output_embeddings(self):
return self.pred_layer.proj
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
langs=langs,
......@@ -634,7 +648,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
outputs = self.pred_layer(output, labels)
......@@ -686,8 +701,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
langs=langs,
......@@ -695,7 +710,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
......@@ -769,8 +785,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None):
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
langs=langs,
......@@ -778,7 +794,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = transformer_outputs[0]
......@@ -864,8 +881,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None,
is_impossible=None, cls_index=None, p_mask=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
......@@ -874,7 +891,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
......
......@@ -558,6 +558,10 @@ XLNET_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
......@@ -579,6 +583,7 @@ class XLNetModel(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -611,10 +616,12 @@ class XLNetModel(XLNetPreTrainedModel):
self.init_weights()
def _resize_token_embeddings(self, new_num_tokens):
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
def get_input_embeddings(self):
return self.word_embedding
def set_input_embeddings(self, new_embeddings):
self.word_embedding = new_embeddings
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
......@@ -710,19 +717,29 @@ class XLNetModel(XLNetPreTrainedModel):
pos_emb = pos_emb.to(next(self.parameters()))
return pos_emb
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None):
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None):
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
# so here we move the first dimension (batch) to the end
input_ids = input_ids.transpose(0, 1).contiguous()
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_ids = input_ids.transpose(0, 1).contiguous()
qlen, bsz = input_ids.shape[0], input_ids.shape[1]
elif inputs_embeds is not None:
inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
qlen, bsz = input_ids.shape[0], input_ids.shape[1]
mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
klen = mlen + qlen
......@@ -775,7 +792,10 @@ class XLNetModel(XLNetPreTrainedModel):
non_tgt_mask = None
##### Word embeddings and prepare h & g hidden states
word_emb_k = self.word_embedding(input_ids)
if inputs_embeds is not None:
word_emb_k = inputs_embeds
else:
word_emb_k = self.word_embedding(input_ids)
output_h = self.dropout(word_emb_k)
if target_mapping is not None:
word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
......@@ -859,7 +879,11 @@ class XLNetModel(XLNetPreTrainedModel):
hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
outputs = outputs + (hidden_states,)
if self.output_attentions:
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
if target_mapping is not None:
# when target_mapping is provided, the per-layer attentions are 2-tuples (one tensor per stream)
attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions)
else:
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
outputs = outputs + (attentions,)
return outputs # outputs, (new_mems), (hidden_states), (attentions)
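# A hedged sketch of the shape change above, using a tiny randomly initialized config;
# all sizes and masks are arbitrary.
import torch
from transformers import XLNetConfig, XLNetModel

config = XLNetConfig(n_layer=2, d_model=32, n_head=4, d_inner=64, output_attentions=True)
model = XLNetModel(config)

input_ids = torch.randint(0, config.n_token, (1, 5))
perm_mask = torch.zeros(1, 5, 5)            # every token may attend to every other token
target_mapping = torch.zeros(1, 1, 5)
target_mapping[0, 0, -1] = 1.0              # predict the last token only

attentions = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)[-1]
# with target_mapping set, each per-layer entry is a 2-tuple (one tensor per attention stream)
assert isinstance(attentions[0], tuple) and len(attentions[0]) == 2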
......@@ -894,6 +918,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -918,15 +943,12 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
self.init_weights()
self.tie_weights()
def tie_weights(self):
""" Make sure we are sharing the embeddings
"""
self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
def get_output_embeddings(self):
return self.lm_loss
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
......@@ -934,7 +956,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
logits = self.lm_loss(transformer_outputs[0])
......@@ -978,6 +1001,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -999,8 +1023,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
......@@ -1008,7 +1032,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
output = self.sequence_summary(output)
......@@ -1028,6 +1053,106 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
XLNET_START_DOCSTRING,
XLNET_INPUTS_DOCSTRING)
class XLNetForTokenClassification(XLNetPreTrainedModel):
r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
Classification scores (before SoftMax).
**mems**: (`optional`, returned when ``config.mem_len > 0``)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
scores = outputs[0]
"""
def __init__(self, config):
super(XLNetForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
self.transformer = XLNetModel(config)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[1:] # Keep mems, hidden states, attentions if they are in it
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
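# The masked loss above can be checked in isolation with a small self-contained sketch;
# shapes and values are arbitrary.
import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 4, num_labels)             # (batch, seq_len, num_labels)
labels = torch.randint(0, num_labels, (2, 4))      # (batch, seq_len)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])      # 0 marks padding

active = attention_mask.view(-1) == 1              # keep only non-padded positions
loss = CrossEntropyLoss()(logits.view(-1, num_labels)[active],
                          labels.view(-1)[active])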
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
......@@ -1050,6 +1175,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
......@@ -1073,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -1094,9 +1224,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights()
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
def forward(self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None,
labels=None, head_mask=None):
labels=None, head_mask=None, inputs_embeds=None):
num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
......@@ -1107,7 +1237,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids,
input_mask=flat_input_mask, attention_mask=flat_attention_mask,
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
head_mask=head_mask)
head_mask=head_mask, inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
......@@ -1158,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -1179,8 +1310,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None,
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None):
outputs = self.transformer(input_ids,
......@@ -1190,7 +1321,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
......@@ -1271,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -1295,8 +1428,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None,
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
......@@ -1305,7 +1438,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
......
......@@ -23,86 +23,66 @@ from torch.optim.lr_scheduler import LambdaLR
logger = logging.getLogger(__name__)
class ConstantLRSchedule(LambdaLR):
""" Constant learning rate schedule.
def get_constant_schedule(optimizer, last_epoch=-1):
""" Create a schedule with a constant learning rate.
"""
def __init__(self, optimizer, last_epoch=-1):
super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
class WarmupConstantSchedule(LambdaLR):
""" Linear warmup and then constant.
Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps.
Keeps learning rate schedule equal to 1. after warmup_steps.
def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
""" Create a schedule with a constant learning rate preceded by a warmup
period during which the learning rate increases linearly between 0 and 1.
"""
def __init__(self, optimizer, warmup_steps, last_epoch=-1):
self.warmup_steps = warmup_steps
super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
def lr_lambda(self, step):
if step < self.warmup_steps:
return float(step) / float(max(1.0, self.warmup_steps))
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1.0, num_warmup_steps))
return 1.
return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
class WarmupLinearSchedule(LambdaLR):
""" Linear warmup and then linear decay.
Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps.
"""
def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
self.warmup_steps = warmup_steps
self.t_total = t_total
super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
def lr_lambda(self, step):
if step < self.warmup_steps:
return float(step) / float(max(1, self.warmup_steps))
return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
class WarmupCosineSchedule(LambdaLR):
""" Linear warmup and then cosine decay.
Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
""" Create a schedule with a learning rate that decreases linearly after
linearly increasing during a warmup period.
"""
def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
self.warmup_steps = warmup_steps
self.t_total = t_total
self.cycles = cycles
super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
def lr_lambda(self, step):
if step < self.warmup_steps:
return float(step) / float(max(1.0, self.warmup_steps))
# progress after warmup
progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
class WarmupCosineWithHardRestartsSchedule(LambdaLR):
""" Linear warmup and then cosine cycles with hard restarts.
Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
learning rate (with hard restarts).
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
return LambdaLR(optimizer, lr_lambda, last_epoch)
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
""" Create a schedule with a learning rate that decreases following the
values of the cosine function between 0 and `pi * cycles` after a warmup
period during which it increases linearly between 0 and 1.
"""
def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
self.warmup_steps = warmup_steps
self.t_total = t_total
self.cycles = cycles
super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
def lr_lambda(self, step):
if step < self.warmup_steps:
return float(step) / float(max(1, self.warmup_steps))
# progress after warmup
progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
if progress >= 1.0:
return 0.0
return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0))))
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
return LambdaLR(optimizer, lr_lambda, last_epoch)
def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1):
""" Create a schedule with a learning rate that decreases following the
values of the cosine function with several hard restarts, after a warmup
period during which it increases linearly between 0 and 1.
"""
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
if progress >= 1.:
return 0.
return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.))))
return LambdaLR(optimizer, lr_lambda, last_epoch)
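# A usage sketch of the function-based schedules, assuming they are re-exported at the package
# level; the stand-in model and step counts are placeholders.
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)                        # stand-in for a real model
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=1000)

for step in range(1000):
    # forward pass and loss.backward() would go here
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()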
class AdamW(Optimizer):
""" Implements Adam algorithm with weight decay fix.
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Applys a warmup schedule on a given learning rate decay schedule."""
def __init__(
self,
initial_learning_rate,
decay_schedule_fn,
warmup_steps,
power=1.0,
name=None):
super(WarmUp, self).__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.power = power
self.decay_schedule_fn = decay_schedule_fn
self.name = name
def __call__(self, step):
with tf.name_scope(self.name or 'WarmUp') as name:
# Implements polynomial warmup. i.e., if global_step < warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
global_step_float = tf.cast(step, tf.float32)
warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
warmup_percent_done = global_step_float / warmup_steps_float
warmup_learning_rate = (
self.initial_learning_rate *
tf.math.pow(warmup_percent_done, self.power))
return tf.cond(global_step_float < warmup_steps_float,
lambda: warmup_learning_rate,
lambda: self.decay_schedule_fn(step),
name=name)
def get_config(self):
return {
'initial_learning_rate': self.initial_learning_rate,
'decay_schedule_fn': self.decay_schedule_fn,
'warmup_steps': self.warmup_steps,
'power': self.power,
'name': self.name
}
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
"""Creates an optimizer with learning rate schedule."""
# Implements linear decay of the learning rate.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr,
decay_steps=num_train_steps,
end_learning_rate=0.0)
if num_warmup_steps:
learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
decay_schedule_fn=learning_rate_fn,
warmup_steps=num_warmup_steps)
optimizer = AdamWeightDecay(
learning_rate=learning_rate_fn,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=['layer_norm', 'bias'])
return optimizer
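# A sketch of a manual TF2 training step built on create_optimizer. The model, data and
# clip_norm value are placeholders; clip_norm is required by the apply_gradients override below.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])        # stand-in model
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = create_optimizer(init_lr=3e-5, num_train_steps=1000, num_warmup_steps=100)

x = tf.random.normal((8, 4))
y = tf.zeros((8,), dtype=tf.int32)

with tf.GradientTape() as tape:
    loss = loss_fn(y, model(x))
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables), clip_norm=1.0)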
class AdamWeightDecay(tf.keras.optimizers.Adam):
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
Just adding the square of the weights to the loss function is *not* the
correct way of using L2 regularization/weight decay with Adam, since that will
interact with the m and v parameters in strange ways.
Instead we want to decay the weights in a manner that doesn't interact with
the m/v parameters. This is equivalent to adding the square of the weights to
the loss with plain (non-momentum) SGD.
"""
def __init__(self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
weight_decay_rate=0.0,
include_in_weight_decay=None,
exclude_from_weight_decay=None,
name='AdamWeightDecay',
**kwargs):
super(AdamWeightDecay, self).__init__(
learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
@classmethod
def from_config(cls, config):
"""Creates an optimizer from its config with WarmUp custom object."""
custom_objects = {'WarmUp': WarmUp}
return super(AdamWeightDecay, cls).from_config(
config, custom_objects=custom_objects)
def _prepare_local(self, var_device, var_dtype, apply_state):
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
apply_state)
apply_state['weight_decay_rate'] = tf.constant(
self.weight_decay_rate, name='adam_weight_decay_rate')
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var *
apply_state['weight_decay_rate'],
use_locking=self._use_locking)
return tf.no_op()
def apply_gradients(self, grads_and_vars, clip_norm, name=None):
grads, tvars = list(zip(*grads_and_vars))
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
def _get_lr(self, var_device, var_dtype, apply_state):
"""Retrieves the learning rate with the given state."""
if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients['lr_t'], dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_dense(
grad, var, **kwargs)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_sparse(
grad, var, indices, **kwargs)
def get_config(self):
config = super(AdamWeightDecay, self).get_config()
config.update({
'weight_decay_rate': self.weight_decay_rate,
})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object):
"""Distribution strategies-aware gradient accumulation utility."""
def __init__(self):
"""Initializes the accumulator."""
self._gradients = []
self._accum_steps = tf.Variable(
initial_value=0,
dtype=tf.int64,
trainable=False,
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
@property
def step(self):
"""Number of accumulated steps."""
return self._accum_steps.value()
@property
def gradients(self):
"""The accumulated gradients."""
return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients())
def __call__(self, gradients):
"""Accumulates :obj:`gradients`."""
if not self._gradients:
self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients])
if len(gradients) != len(self._gradients):
raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
if accum_gradient is not None:
accum_gradient.assign_add(gradient)
self._accum_steps.assign_add(1)
def reset(self):
"""Resets the accumulated gradients."""
if self._gradients:
self._accum_steps.assign(0)
for gradient in self._get_replica_gradients():
if gradient is not None:
gradient.assign(tf.zeros_like(gradient))
def _get_replica_gradients(self):
if tf.distribute.has_strategy():
# In a replica context, we want to accumulate gradients on each replica
# without synchronization, so we directly assign the value of the
# current replica.
replica_context = tf.distribute.get_replica_context()
if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
return self._gradients
return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients)
else:
return self._gradients
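# Outside of a distribution strategy the accumulator can be exercised directly; a toy sketch.
import tensorflow as tf

accumulator = GradientAccumulator()
var = tf.Variable([1.0, 2.0])

for _ in range(4):                          # four micro-batches
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(var * var)
    grads = tape.gradient(loss, [var])
    accumulator(grads)                      # adds into the internal buffers

print(accumulator.step)                     # 4 accumulated steps
summed = accumulator.gradients              # list of summed gradients
accumulator.reset()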
# content of conftest.py
import pytest
def pytest_addoption(parser):
parser.addoption(
"--runslow", action="store_true", default=False, help="run slow tests"
)
def pytest_collection_modifyitems(config, items):
if config.getoption("--runslow"):
# --runslow given in cli: do not skip slow tests
return
skip_slow = pytest.mark.skip(reason="need --runslow option to run")
for item in items:
if "slow" in item.keywords:
item.add_marker(skip_slow)
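# A test file can then opt individual tests into the slow path; test and file names are illustrative.
# content of test_slow_example.py (hypothetical)
import pytest

@pytest.mark.slow
def test_full_model_download():
    assert True     # skipped unless pytest is invoked with --runslow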
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import os
import six
import time
import unittest
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
USER = "__DUMMY_TRANSFORMERS_USER__"
PASS = "__DUMMY_TRANSFORMERS_PASS__"
FILE_KEY = "Test-{}.txt".format(int(time.time()))
FILE_PATH = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
)
class HfApiCommonTest(unittest.TestCase):
_api = HfApi(endpoint="https://moon-staging.huggingface.co")
class HfApiLoginTest(HfApiCommonTest):
def test_login_invalid(self):
with self.assertRaises(HTTPError):
self._api.login(username=USER, password="fake")
def test_login_valid(self):
token = self._api.login(username=USER, password=PASS)
self.assertIsInstance(token, six.string_types)
class HfApiEndpointsTest(HfApiCommonTest):
@classmethod
def setUpClass(cls):
"""
Share this valid token in all tests below.
"""
cls._token = cls._api.login(username=USER, password=PASS)
def test_whoami(self):
user = self._api.whoami(token=self._token)
self.assertEqual(user, USER)
def test_presign(self):
urls = self._api.presign(token=self._token, filename=FILE_KEY)
self.assertIsInstance(urls, PresignedUrl)
self.assertEqual(urls.type, "text/plain")
def test_presign_and_upload(self):
access_url = self._api.presign_and_upload(
token=self._token, filename=FILE_KEY, filepath=FILE_PATH
)
self.assertIsInstance(access_url, six.string_types)
def test_list_objs(self):
objs = self._api.list_objs(token=self._token)
self.assertIsInstance(objs, list)
if len(objs) > 0:
o = objs[-1]
self.assertIsInstance(o, S3Obj)
class HfFolderTest(unittest.TestCase):
def test_token_workflow(self):
"""
Test the whole token save/get/delete workflow,
including the expected behavior when no token has been saved.
"""
token = "token-{}".format(int(time.time()))
HfFolder.save_token(token)
self.assertEqual(
HfFolder.get_token(),
token
)
HfFolder.delete_token()
HfFolder.delete_token()
# ^^ not an error, we test that the
# second call does not fail.
self.assertEqual(
HfFolder.get_token(),
None
)
if __name__ == "__main__":
unittest.main()
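# --- Illustrative usage sketch (not part of the diff) -----------------------
# End-to-end sequence of the HfApi / HfFolder calls exercised by the tests
# above, with placeholder credentials and file paths. Only methods that appear
# in the tests are used here.
from transformers.hf_api import HfApi, HfFolder

api = HfApi(endpoint="https://moon-staging.huggingface.co")
token = api.login(username="my-user", password="my-pass")    # returns a token string
HfFolder.save_token(token)                                    # cache it locally

access_url = api.presign_and_upload(
    token=HfFolder.get_token(),
    filename="example.txt",
    filepath="/path/to/example.txt",
)
objs = api.list_objs(token=token)                             # list of S3Obj entries

HfFolder.delete_token()                                       # safe to call twice (see test above)
# ---------------------------------------------------------------------------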
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
AlbertForSequenceClassification, AlbertForQuestionAnswering,
)
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
class AlbertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
class AlbertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
embedding_size=16,
hidden_size=36,
num_hidden_layers=6,
num_hidden_groups=6,
num_attention_heads=6,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
self.num_hidden_groups = num_hidden_groups
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = AlbertConfig(
vocab_size_or_config_json_file=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
num_hidden_groups=self.num_hidden_groups)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertModel(config=config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids)
result = {
"sequence_output": sequence_output,
"pooled_output": pooled_output,
}
self.parent.assertListEqual(
list(result["sequence_output"].size()),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
result = {
"loss": loss,
"prediction_scores": prediction_scores,
}
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
start_positions=sequence_labels, end_positions=sequence_labels)
result = {
"loss": loss,
"start_logits": start_logits,
"end_logits": end_logits,
}
self.parent.assertListEqual(
list(result["start_logits"].size()),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].size()),
[self.batch_size, self.seq_length])
self.check_loss_output(result)
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = AlbertForSequenceClassification(config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
result = {
"loss": loss,
"logits": logits,
}
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.num_labels])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = AlbertModelTest.AlbertModelTester(self)
self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_albert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
......@@ -18,11 +18,12 @@ from __future__ import print_function
import unittest
import shutil
import pytest
import logging
from transformers import is_torch_available
from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
if is_torch_available():
from transformers import (AutoConfig, BertConfig,
AutoModel, BertModel,
......@@ -33,11 +34,11 @@ if is_torch_available():
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
else:
pytestmark = pytest.mark.skip("Require Torch")
@require_torch
class AutoModelTest(unittest.TestCase):
@slow
def test_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -52,6 +53,7 @@ class AutoModelTest(unittest.TestCase):
for value in loading_info.values():
self.assertEqual(len(value), 0)
@slow
def test_lmhead_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -64,6 +66,7 @@ class AutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForMaskedLM)
@slow
def test_sequence_classification_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -76,6 +79,7 @@ class AutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForSequenceClassification)
@slow
def test_question_answering_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -88,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForQuestionAnswering)
def test_from_pretrained_identifier(self):
logging.basicConfig(level=logging.INFO)
model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
self.assertIsInstance(model, BertForMaskedLM)
if __name__ == "__main__":
unittest.main()
......@@ -18,28 +18,27 @@ from __future__ import print_function
import unittest
import shutil
import pytest
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
from transformers import (BertConfig, BertModel, BertForMaskedLM,
BertForNextSentencePrediction, BertForPreTraining,
BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification, BertForMultipleChoice)
BertForNextSentencePrediction, BertForPreTraining,
BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification, BertForMultipleChoice)
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
else:
pytestmark = pytest.mark.skip("Require Torch")
@require_torch
class BertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification) if is_torch_available() else ()
BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification) if is_torch_available() else ()
class BertModelTester(object):
......@@ -66,7 +65,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
num_labels=3,
num_choices=4,
scope=None,
):
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
......@@ -120,10 +119,20 @@ class BertModelTest(CommonTestCases.CommonModelTester):
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def prepare_config_and_inputs_for_decoder(self):
config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = self.prepare_config_and_inputs()
config.is_decoder = True
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
......@@ -131,6 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertModel(config=config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
......@@ -145,9 +155,26 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
model = BertModel(config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = {
"sequence_output": sequence_output,
"pooled_output": pooled_output,
}
self.parent.assertListEqual(
list(result["sequence_output"].size()),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
result = {
......@@ -159,8 +186,24 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
model = BertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
result = {
"loss": loss,
"prediction_scores": prediction_scores,
}
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForNextSentencePrediction(config=config)
model.to(torch_device)
model.eval()
loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
result = {
......@@ -172,9 +215,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, 2])
self.check_loss_output(result)
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForPreTraining(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
......@@ -191,9 +234,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, 2])
self.check_loss_output(result)
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
start_positions=sequence_labels, end_positions=sequence_labels)
......@@ -210,10 +253,10 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length])
self.check_loss_output(result)
def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = BertForSequenceClassification(config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
result = {
......@@ -225,10 +268,10 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.num_labels])
self.check_loss_output(result)
def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = BertForTokenClassification(config=config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
result = {
......@@ -240,10 +283,10 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_choices = self.num_choices
model = BertForMultipleChoice(config=config)
model.to(torch_device)
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
......@@ -261,7 +304,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.num_choices])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
......@@ -280,10 +322,18 @@ class BertModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_model(*config_and_inputs)
def test_bert_model_as_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_bert_model_as_decoder(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
def test_for_masked_lm_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_bert_model_for_masked_lm_as_decoder(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
......@@ -308,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -316,5 +366,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
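# --- Illustrative usage sketch (not part of the diff) -----------------------
# The new decoder path tested above, in miniature: a BertConfig with
# is_decoder=True lets BertModel cross-attend over externally supplied encoder
# states. The tiny config values and tensor shapes are illustrative
# assumptions, not taken from the test file.
import torch
from transformers import BertConfig, BertModel

config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=36,
                    num_hidden_layers=2, num_attention_heads=6,
                    intermediate_size=37, is_decoder=True)
decoder = BertModel(config)
decoder.eval()

input_ids = torch.randint(0, 99, (2, 7))                      # (batch, seq_len)
encoder_hidden_states = torch.rand(2, 7, 36)                  # from some encoder
encoder_attention_mask = torch.ones(2, 7, dtype=torch.long)

sequence_output, pooled_output = decoder(
    input_ids,
    encoder_hidden_states=encoder_hidden_states,
    encoder_attention_mask=encoder_attention_mask,
)
# ---------------------------------------------------------------------------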
......@@ -27,19 +27,18 @@ import uuid
import unittest
import logging
import pytest
from transformers import is_torch_available
from .utils import require_torch, slow, torch_device
if is_torch_available():
import torch
import numpy as np
from transformers import (PretrainedConfig, PreTrainedModel,
from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
pytestmark = pytest.mark.skip("Require Torch")
if sys.version_info[0] == 2:
import cPickle as pickle
......@@ -65,6 +64,7 @@ def _config_zero_init(config):
class CommonTestCases:
@require_torch
class CommonModelTester(unittest.TestCase):
model_tester = None
......@@ -79,6 +79,7 @@ class CommonTestCases:
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**inputs_dict)
......@@ -86,12 +87,13 @@ class CommonTestCases:
with TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname)
model.to(torch_device)
with torch.no_grad():
after_outputs = model(**inputs_dict)
# Make sure we don't have nans
out_1 = after_outputs[0].numpy()
out_2 = outputs[0].numpy()
out_1 = after_outputs[0].cpu().numpy()
out_2 = outputs[0].cpu().numpy()
out_1 = out_1[~np.isnan(out_1)]
out_2 = out_2[~np.isnan(out_2)]
max_diff = np.amax(np.abs(out_1 - out_2))
......@@ -113,6 +115,7 @@ class CommonTestCases:
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
self.assertEqual(first.ne(second).sum().item(), 0)
......@@ -125,6 +128,7 @@ class CommonTestCases:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
attentions = outputs[-1]
......@@ -142,6 +146,7 @@ class CommonTestCases:
config.output_attentions = True
config.output_hidden_states = True
model = model_class(config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
self.assertEqual(out_len+1, len(outputs))
......@@ -181,6 +186,7 @@ class CommonTestCases:
configs_no_init.torchscript = True
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
model.to(torch_device)
model.eval()
inputs = inputs_dict['input_ids'] # Let's keep only input_ids
......@@ -201,7 +207,10 @@ class CommonTestCases:
except ValueError:
self.fail("Couldn't load module.")
model.to(torch_device)
model.eval()
loaded_model.to(torch_device)
loaded_model.eval()
model_params = model.parameters()
......@@ -228,11 +237,12 @@ class CommonTestCases:
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
model.to(torch_device)
model.eval()
# Prepare head_mask
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
head_mask[0, 0] = 0
head_mask[-1, :-1] = 0
head_mask.requires_grad_(requires_grad=True)
......@@ -282,6 +292,7 @@ class CommonTestCases:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config=config)
model.to(torch_device)
model.eval()
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-1: [0]}
......@@ -310,6 +321,7 @@ class CommonTestCases:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config=config)
model.to(torch_device)
model.eval()
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-1: [0]}
......@@ -319,6 +331,7 @@ class CommonTestCases:
os.makedirs(directory)
model.save_pretrained(directory)
model = model_class.from_pretrained(directory)
model.to(torch_device)
outputs = model(**inputs_dict)
attentions = outputs[-1]
......@@ -346,6 +359,7 @@ class CommonTestCases:
config.pruned_heads = heads_to_prune
model = model_class(config=config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
......@@ -372,6 +386,7 @@ class CommonTestCases:
config.pruned_heads = heads_to_prune
model = model_class(config=config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
......@@ -388,6 +403,7 @@ class CommonTestCases:
os.makedirs(directory)
model.save_pretrained(directory)
model = model_class.from_pretrained(directory)
model.to(torch_device)
shutil.rmtree(directory)
outputs = model(**inputs_dict)
......@@ -419,6 +435,7 @@ class CommonTestCases:
config.output_hidden_states = True
config.output_attentions = False
model = model_class(config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
hidden_states = outputs[-1]
......@@ -463,6 +480,21 @@ class CommonTestCases:
self.assertTrue(models_equal)
def test_model_common_attributes(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(
model.get_input_embeddings(),
(torch.nn.Embedding, AdaptiveEmbedding)
)
model.set_input_embeddings(torch.nn.Embedding(10, 10))
x = model.get_output_embeddings()
self.assertTrue(
x is None or isinstance(x, torch.nn.Linear)
)
def test_tie_model_weights(self):
if not self.test_torchscript:
return
......@@ -477,11 +509,11 @@ class CommonTestCases:
return equal
for model_class in self.all_model_classes:
if not hasattr(model_class, 'tie_weights'):
continue
config.torchscript = True
model_not_tied = model_class(config)
if model_not_tied.get_output_embeddings() is None:
continue
params_not_tied = list(model_not_tied.parameters())
config_tied = copy.deepcopy(config)
......@@ -516,6 +548,20 @@ class CommonTestCases:
# self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
# self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict["input_ids"]
del inputs_dict["input_ids"]
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
wte = model.get_input_embeddings()
inputs_dict["inputs_embeds"] = wte(input_ids)
outputs = model(**inputs_dict)
class GPTModelTester(CommonModelTester):
......@@ -600,6 +646,7 @@ class CommonTestCases:
def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.base_model_class(config)
model.to(torch_device)
model.eval()
outputs = model(input_ids, position_ids, token_type_ids)
......@@ -615,6 +662,7 @@ class CommonTestCases:
def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.lm_head_model_class(config)
model.to(torch_device)
model.eval()
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
loss, lm_logits = outputs[:2]
......@@ -631,6 +679,7 @@ class CommonTestCases:
mc_labels, lm_labels, mc_token_ids):
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
outputs = model(input_ids)
presents = outputs[-1]
......@@ -643,6 +692,7 @@ class CommonTestCases:
def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.double_head_model_class(config)
model.to(torch_device)
model.eval()
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
token_type_ids=token_type_ids, position_ids=position_ids)
......@@ -688,6 +738,7 @@ class CommonTestCases:
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_presents(*config_and_inputs)
@slow
def run_slow_tests(self):
self.create_and_check_model_from_pretrained()
......@@ -741,10 +792,28 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
def floats_tensor(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor of the shape within the vocab size."""
if rng is None:
rng = global_rng
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.random() * scale)
return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
@require_torch
class ModelUtilsTest(unittest.TestCase):
@slow
def test_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
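# --- Illustrative usage sketch (not part of the diff) -----------------------
# The new inputs_embeds path exercised by test_inputs_embeds above, in
# miniature: pre-compute the token embeddings and feed them to the model
# instead of input_ids. BertModel and the tiny config values are illustrative
# assumptions.
import torch
from transformers import BertConfig, BertModel

model = BertModel(BertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                             num_hidden_layers=2, num_attention_heads=4,
                             intermediate_size=37))
model.eval()

input_ids = torch.randint(0, 99, (2, 7))
inputs_embeds = model.get_input_embeddings()(input_ids)       # (2, 7, 32)
sequence_output, pooled_output = model(inputs_embeds=inputs_embeds)
# ---------------------------------------------------------------------------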
......@@ -16,7 +16,6 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
import shutil
import pdb
......@@ -25,13 +24,13 @@ from transformers import is_torch_available
if is_torch_available():
from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
CTRLLMHeadModel)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class CTRLModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
......@@ -140,6 +139,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = CTRLModel(config=config)
model.to(torch_device)
model.eval()
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
......@@ -157,6 +157,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = CTRLLMHeadModel(config)
model.to(torch_device)
model.eval()
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
......@@ -202,7 +203,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
......@@ -17,20 +17,20 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
from transformers import is_torch_available
if is_torch_available():
from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
DistilBertForTokenClassification,
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class DistilBertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
......@@ -125,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertModel(config=config)
model.to(torch_device)
model.eval()
(sequence_output,) = model(input_ids, input_mask)
(sequence_output,) = model(input_ids)
......@@ -138,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
result = {
......@@ -151,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
result = {
......@@ -169,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = DistilBertForSequenceClassification(config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
result = {
......@@ -180,6 +184,22 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.num_labels])
self.check_loss_output(result)
def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = DistilBertForTokenClassification(config=config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
result = {
"loss": loss,
"logits": logits,
}
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
......@@ -209,7 +229,11 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
# @pytest.mark.slow
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
# @slow
# def test_model_from_pretrained(self):
# cache_dir = "/tmp/transformers_test/"
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
# coding=utf-8
# Copyright 2018 The Hugging Face Inc. Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import unittest
from transformers import is_torch_available
from .utils import require_torch, slow
if is_torch_available():
from transformers import BertModel, BertForMaskedLM, Model2Model
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
class EncoderDecoderModelTest(unittest.TestCase):
@slow
def test_model2model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = Model2Model.from_pretrained(model_name)
self.assertIsInstance(model.encoder, BertModel)
self.assertIsInstance(model.decoder, BertForMaskedLM)
self.assertEqual(model.decoder.config.is_decoder, True)
self.assertEqual(model.encoder.config.is_decoder, False)
def test_model2model_from_pretrained_not_bert(self):
logging.basicConfig(level=logging.INFO)
with self.assertRaises(ValueError):
_ = Model2Model.from_pretrained('roberta')
with self.assertRaises(ValueError):
_ = Model2Model.from_pretrained('distilbert')
with self.assertRaises(ValueError):
_ = Model2Model.from_pretrained('does-not-exist')
if __name__ == "__main__":
unittest.main()
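# --- Illustrative usage sketch (not part of the diff) -----------------------
# What the test above asserts, in miniature: Model2Model.from_pretrained wraps
# a BERT encoder and a BERT decoder-with-LM-head, flipping is_decoder only on
# the decoder side. "bert-base-uncased" is an assumed checkpoint name and the
# call downloads pretrained weights.
from transformers import Model2Model

model = Model2Model.from_pretrained("bert-base-uncased")
assert model.encoder.config.is_decoder is False
assert model.decoder.config.is_decoder is True
# ---------------------------------------------------------------------------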
......@@ -17,7 +17,6 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
import shutil
from transformers import is_torch_available
......@@ -25,13 +24,13 @@ from transformers import is_torch_available
if is_torch_available():
from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
GPT2LMHeadModel, GPT2DoubleHeadsModel)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class GPT2ModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
......@@ -136,6 +135,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = GPT2Model(config=config)
model.to(torch_device)
model.eval()
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
......@@ -153,6 +153,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = GPT2LMHeadModel(config)
model.to(torch_device)
model.eval()
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
......@@ -171,6 +172,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
model = GPT2DoubleHeadsModel(config)
model.to(torch_device)
model.eval()
......@@ -235,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
......@@ -17,7 +17,6 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
import shutil
from transformers import is_torch_available
......@@ -25,13 +24,13 @@ from transformers import is_torch_available
if is_torch_available():
from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
......@@ -124,6 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTModel(config=config)
model.to(torch_device)
model.eval()
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
......@@ -139,6 +139,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTLMHeadModel(config)
model.to(torch_device)
model.eval()
loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
......@@ -157,6 +158,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTDoubleHeadsModel(config)
model.to(torch_device)
model.eval()
loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
......@@ -203,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
......@@ -18,21 +18,21 @@ from __future__ import print_function
import unittest
import shutil
import pytest
from transformers import is_torch_available
if is_torch_available():
import torch
from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
RobertaForSequenceClassification, RobertaForTokenClassification)
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class RobertaModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
......@@ -128,6 +128,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
token_labels, choice_labels):
model = RobertaModel(config=config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
......@@ -145,6 +146,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
token_labels, choice_labels):
model = RobertaForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
result = {
......@@ -156,6 +158,23 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask,
sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = RobertaForTokenClassification(config=config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
labels=token_labels)
result = {
"loss": loss,
"logits": logits,
}
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
......@@ -178,7 +197,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -190,10 +209,10 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
class RobertaModelIntegrationTest(unittest.TestCase):
@pytest.mark.slow
@slow
def test_inference_masked_lm(self):
model = RobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
expected_shape = torch.Size((1, 11, 50265))
......@@ -211,10 +230,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
)
@pytest.mark.slow
@slow
def test_inference_no_head(self):
model = RobertaModel.from_pretrained('roberta-base')
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
# compare the actual values for a slice.
......@@ -227,10 +246,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
)
@pytest.mark.slow
@slow
def test_inference_classification_head(self):
model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
expected_shape = torch.Size((1, 3))
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow
from transformers import AlbertConfig, is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
TFAlbertForSequenceClassification,
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
@require_tf
class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (
TFAlbertModel,
TFAlbertForMaskedLM,
TFAlbertForSequenceClassification
) if is_tf_available() else ()
class TFAlbertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
embedding_size=16,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor(
[self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor(
[self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor(
[self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor(
[self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor(
[self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = AlbertConfig(
vocab_size_or_config_json_file=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = TFAlbertModel(config=config)
# inputs = {'input_ids': input_ids,
# 'attention_mask': input_mask,
# 'token_type_ids': token_type_ids}
# sequence_output, pooled_output = model(**inputs)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output, pooled_output = model(inputs)
inputs = [input_ids, input_mask]
sequence_output, pooled_output = model(inputs)
sequence_output, pooled_output = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
"pooled_output": pooled_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].shape), [
self.batch_size, self.hidden_size])
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = TFAlbertForMaskedLM(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
prediction_scores, = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = TFAlbertForSequenceClassification(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
logits, = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(
list(result["logits"].shape),
[self.batch_size, self.num_labels])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids,
'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
self.config_tester = ConfigTester(
self, config_class=AlbertConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_albert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_masked_lm(
*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_sequence_classification(
*config_and_inputs)
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
# for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
for model_name in ['albert-base-uncased']:
model = TFAlbertModel.from_pretrained(
model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
......@@ -18,11 +18,12 @@ from __future__ import print_function
import unittest
import shutil
import pytest
import logging
from transformers import is_tf_available
from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
if is_tf_available():
from transformers import (AutoConfig, BertConfig,
TFAutoModel, TFBertModel,
......@@ -33,11 +34,11 @@ if is_tf_available():
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
else:
pytestmark = pytest.mark.skip("Require TensorFlow")
@require_tf
class TFAutoModelTest(unittest.TestCase):
@slow
def test_model_from_pretrained(self):
import h5py
self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
......@@ -53,6 +54,7 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertModel)
@slow
def test_lmhead_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -65,6 +67,7 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForMaskedLM)
@slow
def test_sequence_classification_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -77,6 +80,7 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForSequenceClassification)
@slow
def test_question_answering_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -89,6 +93,11 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForQuestionAnswering)
def test_from_pretrained_identifier(self):
logging.basicConfig(level=logging.INFO)
model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True)
self.assertIsInstance(model, TFBertForMaskedLM)
if __name__ == "__main__":
unittest.main()