"vscode:/vscode.git/clone" did not exist on "38630e7a8747429517c9a82129ac32638cebb052"
Commit 0558c9cb authored by thomwolf's avatar thomwolf
Browse files

Merge branch 'master' into t5

parents 608a8f5b e57d00ee
@@ -118,6 +118,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
            new_key = key.replace('gamma', 'weight')
        if 'beta' in key:
            new_key = key.replace('beta', 'bias')
+        # DialoGPT format
+        if key == 'lm_head.decoder.weight':
+            new_key = 'lm_head.weight'
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
...
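As a side note on the hunk above: the DialoGPT check slots into the existing key-renaming pass. A minimal, self-contained sketch of that pass (the dict below is a toy stand-in for the real `pt_state_dict`):

```python
# Toy stand-in for the loaded PyTorch state dict (hypothetical keys).
pt_state_dict = {'encoder.layer.0.gamma': 0, 'encoder.layer.0.beta': 1,
                 'lm_head.decoder.weight': 2}

old_keys, new_keys = [], []
for key in pt_state_dict.keys():
    new_key = None
    if 'gamma' in key:
        new_key = key.replace('gamma', 'weight')  # old TF naming -> PyTorch naming
    if 'beta' in key:
        new_key = key.replace('beta', 'bias')
    if key == 'lm_head.decoder.weight':  # DialoGPT checkpoints name the tied LM head this way
        new_key = 'lm_head.weight'
    if new_key:
        old_keys.append(key)
        new_keys.append(new_key)

# Rename after iterating, so the dict is not mutated mid-loop.
for old_key, new_key in zip(old_keys, new_keys):
    pt_state_dict[new_key] = pt_state_dict.pop(old_key)

assert 'lm_head.weight' in pt_state_dict
```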
@@ -24,7 +24,7 @@ import numpy as np
import tensorflow as tf

from .configuration_roberta import RobertaConfig
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
from .file_utils import add_start_docstrings
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
@@ -48,13 +48,17 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
    def _embedding(self, inputs, training=False):
        """Applies embedding based on inputs tensor."""
-        input_ids, position_ids, token_type_ids = inputs
+        input_ids, position_ids, token_type_ids, inputs_embeds = inputs

-        seq_length = tf.shape(input_ids)[1]
+        if input_ids is not None:
+            seq_length = shape_list(input_ids)[1]
+        else:
+            seq_length = shape_list(inputs_embeds)[1]

        if position_ids is None:
            position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :]

-        return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids], training=training)
+        return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)

class TFRobertaMainLayer(TFBertMainLayer):
@@ -65,6 +69,9 @@ class TFRobertaMainLayer(TFBertMainLayer):
        super(TFRobertaMainLayer, self).__init__(config, **kwargs)
        self.embeddings = TFRobertaEmbeddings(config, name='embeddings')

+    def get_input_embeddings(self):
+        return self.embeddings

class TFRobertaPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
@@ -157,6 +164,10 @@ ROBERTA_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""

@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
@@ -276,6 +287,9 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")

+    def get_output_embeddings(self):
+        return self.lm_head.decoder

    def call(self, inputs, **kwargs):
        outputs = self.roberta(inputs, **kwargs)
...
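Taken together, the RoBERTa changes expose an `inputs_embeds` escape hatch plus embedding accessors. A hedged usage sketch, assuming the standard `'roberta-base'` checkpoint and that the embedding layer exposes a `word_embeddings` weight as in the TF BERT embeddings it inherits from:

```python
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')

input_ids = tf.constant(tokenizer.encode("Hello world"))[None, :]  # batch size 1

# Do the embedding lookup ourselves, then bypass it in the model.
embedding_layer = model.roberta.get_input_embeddings()
inputs_embeds = tf.gather(embedding_layer.word_embeddings, input_ids)
outputs = model({'inputs_embeds': inputs_embeds})
prediction_scores = outputs[0]

print(model.get_output_embeddings())  # the new accessor: the LM head's decoder
```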
@@ -337,7 +337,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
                emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i])

                mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
-                emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(tf.shape(emb_flat), dtype=tf.int64))
+                emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64))

        embed_shape = shape_list(inp) + [self.d_proj]
        embed = tf.reshape(emb_flat, embed_shape)
@@ -413,6 +413,9 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
                                        name='r_r_bias')
        super(TFTransfoXLMainLayer, self).build(input_shape)

+    def get_input_embeddings(self):
+        return self.word_emb

    def _resize_token_embeddings(self, new_num_tokens):
        return self.word_emb
@@ -427,11 +430,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
    def _prune_heads(self, heads):
        raise NotImplementedError

-    def init_mems(self, data):
+    def init_mems(self, bsz):
        if self.mem_len > 0:
            mems = []
            for i in range(self.n_layer):
-                empty = tf.zeros([self.mem_len, shape_list(data)[1], self.d_model])
+                empty = tf.zeros([self.mem_len, bsz, self.d_model])
                mems.append(empty)
            return mems
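The signature change works because the empty memory tensors depend only on the batch size, which the caller can now derive from either `input_ids` or `inputs_embeds`. A toy illustration (sizes are made up):

```python
import tensorflow as tf

mem_len, n_layer, d_model = 4, 2, 8  # made-up sizes
bsz = 3

mems = [tf.zeros([mem_len, bsz, d_model]) for _ in range(n_layer)]
assert mems[0].shape == (mem_len, bsz, d_model)  # no input tensor needed
```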
@@ -461,28 +464,37 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        return new_mems

-    def call(self, inputs, mems=None, head_mask=None, training=False):
+    def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, training=False):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
-            assert len(inputs) <= 3, "Too many inputs."
+            inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
+            assert len(inputs) <= 4, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            mems = inputs.get('mems', mems)
            head_mask = inputs.get('head_mask', head_mask)
-            assert len(inputs) <= 3, "Too many inputs."
+            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            assert len(inputs) <= 4, "Too many inputs."
        else:
            input_ids = inputs

        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
-        input_ids = tf.transpose(input_ids, perm=(1, 0))
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = tf.transpose(input_ids, perm=(1, 0))
+            qlen, bsz = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
+            qlen, bsz = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if mems is None:
-            mems = self.init_mems(input_ids)
+            mems = self.init_mems(bsz)
-        qlen, bsz = shape_list(input_ids)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
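The branch above also fixes where `qlen` and `bsz` come from: the layer works internally in `[len, bsz(, dim)]` order, so whichever input is present gets transposed first. A toy check of the `inputs_embeds` path:

```python
import tensorflow as tf

inputs_embeds = tf.zeros([3, 5, 8])                          # [bsz, len, dim]
inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))  # -> [len, bsz, dim]
qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
assert (qlen, bsz) == (5, 3)
```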
@@ -494,7 +506,10 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        else:
            head_mask = [None] * self.n_layer

-        word_emb = self.word_emb(input_ids)
+        if inputs_embeds is not None:
+            word_emb = inputs_embeds
+        else:
+            word_emb = self.word_emb(input_ids)

        mlen = shape_list(mems[0])[0] if mems is not None else 0
        klen = mlen + qlen
@@ -626,6 +641,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""

@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
@@ -716,28 +735,33 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
    def reset_length(self, tgt_len, ext_len, mem_len):
        self.transformer.reset_length(tgt_len, ext_len, mem_len)

-    def init_mems(self, data):
-        return self.transformer.init_mems(data)
+    def init_mems(self, bsz):
+        return self.transformer.init_mems(bsz)

-    def call(self, inputs, mems=None, head_mask=None, labels=None, training=False):
+    def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, labels=None, training=False):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
-            labels = inputs[3] if len(inputs) > 3 else labels
-            assert len(inputs) <= 4, "Too many inputs."
+            inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
+            labels = inputs[4] if len(inputs) > 4 else labels
+            assert len(inputs) <= 5, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            mems = inputs.get('mems', mems)
            head_mask = inputs.get('head_mask', head_mask)
+            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
            labels = inputs.get('labels', labels)
-            assert len(inputs) <= 4, "Too many inputs."
+            assert len(inputs) <= 5, "Too many inputs."
        else:
            input_ids = inputs

-        bsz, tgt_len = shape_list(input_ids)[:2]
+        if input_ids is not None:
+            bsz, tgt_len = shape_list(input_ids)[:2]
+        else:
+            bsz, tgt_len = shape_list(inputs_embeds)[:2]

-        transformer_outputs = self.transformer([input_ids, mems, head_mask], training=training)
+        transformer_outputs = self.transformer([input_ids, mems, head_mask, inputs_embeds], training=training)

        last_hidden = transformer_outputs[0]
        pred_hid = last_hidden[:, -tgt_len:]
...
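Because `labels` moved from positional slot 3 to slot 4 to make room for `inputs_embeds`, positional lists written against the old order will silently misbehave; a dict (or keyword arguments) is the safer calling convention. A hedged sketch, assuming the standard `'transfo-xl-wt103'` checkpoint:

```python
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')

input_ids = tf.constant(tokenizer.encode("hello world"))[None, :]  # batch size 1

# Positional order is now (input_ids, mems, head_mask, inputs_embeds, labels);
# passing a dict avoids depending on it.
outputs = model({'input_ids': input_ids})
prediction_scores, mems = outputs[0], outputs[1]
```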
@@ -105,7 +105,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
    @staticmethod
    def _gather_logprob(logprob, target):
-        lp_size = tf.shape(logprob)
+        lp_size = shape_list(logprob)
        r = tf.range(lp_size[0])
        idx = tf.stack([r, target], 1)
        return tf.gather_nd(logprob, idx)

@@ -159,7 +159,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                    cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
                    cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
                if target is not None:
-                    loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(tf.shape(loss), dtype=tf.int64))
+                    loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64))
        out = tf.concat(out, axis=-1)

        if target is not None:
...
@@ -35,7 +35,7 @@ class TFPreTrainedModel(tf.keras.Model):
    r""" Base class for all TF models.

        :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
-        as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
+        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.

        Class attributes (overridden by derived classes):
            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
@@ -51,7 +51,15 @@ class TFPreTrainedModel(tf.keras.Model):
    config_class = None
    pretrained_model_archive_map = {}
    base_model_prefix = ""
-    dummy_inputs = tf.constant(DUMMY_INPUTS)  # dummy inputs to build the network
+
+    @property
+    def dummy_inputs(self):
+        """ Dummy inputs to build the network.
+
+        Returns:
+            tf.Tensor with dummy inputs
+        """
+        return tf.constant(DUMMY_INPUTS)

    def __init__(self, config, *inputs, **kwargs):
        super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
@@ -65,6 +73,21 @@ class TFPreTrainedModel(tf.keras.Model):
        # Save config in model
        self.config = config

+    def get_input_embeddings(self):
+        """ Get model's input embeddings
+        """
+        base_model = getattr(self, self.base_model_prefix, self)
+        if base_model is not self:
+            return base_model.get_input_embeddings()
+        else:
+            raise NotImplementedError
+
+    def get_output_embeddings(self):
+        """ Get model's output embeddings
+            Return None if the model doesn't have output embeddings
+        """
+        return None  # Overwrite for models with output embeddings
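From user code, the two accessors make it possible to reach the embedding matrices without knowing each architecture's attribute names. A hedged sketch (it assumes the BERT main layer gained the same `get_input_embeddings` accessor in this commit; its hunk is not shown here):

```python
from transformers import TFBertModel

model = TFBertModel.from_pretrained('bert-base-uncased')

input_embeddings = model.get_input_embeddings()    # delegates to the base model
output_embeddings = model.get_output_embeddings()  # None: no LM head on this model

assert output_embeddings is None
print(input_embeddings)  # the shared token embedding layer
```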
    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
        """ Build a resized Embedding Variable from a provided token Embedding Module.
            Increasing the size will add newly initialized vectors at the end
@@ -176,6 +199,9 @@ class TFPreTrainedModel(tf.keras.Model):
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
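A quick usage note for the new flag, sketched for the TF loader documented above:

```python
from transformers import TFBertModel

# Keeps a partially downloaded archive and resumes it on the next call,
# instead of starting the download from scratch.
model = TFBertModel.from_pretrained('bert-base-uncased', resume_download=True)
```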
@@ -201,6 +227,7 @@ class TFPreTrainedModel(tf.keras.Model):
        cache_dir = kwargs.pop('cache_dir', None)
        from_pt = kwargs.pop('from_pt', False)
        force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
        proxies = kwargs.pop('proxies', None)

        # Load config
@@ -209,6 +236,7 @@ class TFPreTrainedModel(tf.keras.Model):
                pretrained_model_name_or_path, *model_args,
                cache_dir=cache_dir, return_unused_kwargs=True,
                force_download=force_download,
+                resume_download=resume_download,
                **kwargs
            )
        else:
@@ -236,7 +264,8 @@ class TFPreTrainedModel(tf.keras.Model):
        # redirect to the cache, if necessary
        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
+                                                resume_download=resume_download, proxies=proxies)
        except EnvironmentError as e:
            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                logger.error(
@@ -439,7 +468,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
        elif self.summary_type == 'first':
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
-            output = tf.mean(hidden_states, axis=1)
+            output = tf.reduce_mean(hidden_states, axis=1)
        elif self.summary_type == 'cls_index':
            hidden_shape = shape_list(hidden_states)  # e.g. [batch, num choices, seq length, hidden dims]
            if cls_index is None:
@@ -477,10 +506,10 @@ def shape_list(x):
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]

def get_initializer(initializer_range=0.02):
    """Creates a `tf.initializers.truncated_normal` with the given range.

    Args:
        initializer_range: float, initializer range for stddev.

    Returns:
        TruncatedNormal initializer with stddev = `initializer_range`.
    """
    return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
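Typical use of `get_initializer` is when stacking a head on top of a pretrained trunk; `0.02` here mirrors the usual `initializer_range` default, but any config value works:

```python
import tensorflow as tf
from transformers.modeling_tf_utils import get_initializer

classifier = tf.keras.layers.Dense(
    units=2,
    kernel_initializer=get_initializer(0.02),  # truncated normal, stddev 0.02
    name='classifier')
```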
@@ -277,6 +277,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            self.prune_heads({int(layer): list(map(int, heads))})

+    def get_input_embeddings(self):
+        return self.embeddings

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError
@@ -288,7 +291,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
        raise NotImplementedError

    def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None,
-             position_ids=None, lengths=None, cache=None, head_mask=None,
+             position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None,
             training=False):  # removed: src_enc=None, src_len=None
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -299,7 +302,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            lengths = inputs[5] if len(inputs) > 5 else lengths
            cache = inputs[6] if len(inputs) > 6 else cache
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
-            assert len(inputs) <= 8, "Too many inputs."
+            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
+            assert len(inputs) <= 9, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
@@ -309,16 +313,28 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            lengths = inputs.get('lengths', lengths)
            cache = inputs.get('cache', cache)
            head_mask = inputs.get('head_mask', head_mask)
-            assert len(inputs) <= 8, "Too many inputs."
+            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            assert len(inputs) <= 9, "Too many inputs."
        else:
            input_ids = inputs

+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            bs, slen = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            bs, slen = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if lengths is None:
-            lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
+            if input_ids is not None:
+                lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
+            else:
+                lengths = tf.convert_to_tensor([slen]*bs, tf.int32)
        # mask = input_ids != self.pad_index

        # check inputs
-        bs, slen = shape_list(input_ids)
        # assert shape_list(lengths)[0] == bs
        tf.debugging.assert_equal(shape_list(lengths)[0], bs)
        # assert lengths.max().item() <= slen
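The `lengths` fallback above is worth spelling out: with `input_ids` the true lengths are recovered by counting non-pad tokens, but an embedded input carries no pad markers, so every row is assumed to be full-length. A toy comparison:

```python
import tensorflow as tf

pad_index, bs, slen = 2, 2, 5
input_ids = tf.constant([[4, 5, 6, 2, 2],
                         [7, 8, 2, 2, 2]])

from_ids = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, pad_index), tf.int32), axis=1)
from_embeds = tf.convert_to_tensor([slen] * bs, tf.int32)  # the fallback

print(from_ids.numpy())     # [3 2]
print(from_embeds.numpy())  # [5 5]
```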
@@ -358,7 +374,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            head_mask = [None] * self.n_layers

        # do not recompute cached elements
-        if cache is not None:
+        if cache is not None and input_ids is not None:
            _slen = slen - cache['slen']
            input_ids = input_ids[:, -_slen:]
            position_ids = position_ids[:, -_slen:]
@@ -368,8 +384,10 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
                attn_mask = attn_mask[:, -_slen:]

        # embeddings
-        tensor = self.embeddings(input_ids)
-        tensor = tensor + self.position_embeddings(position_ids)
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        tensor = inputs_embeds + self.position_embeddings(position_ids)
        if langs is not None and self.use_lang_emb:
            tensor = tensor + self.lang_embeddings(langs)
        if token_type_ids is not None:
@@ -530,6 +548,10 @@ XLM_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""

@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
@@ -637,6 +659,8 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
        self.transformer = TFXLMMainLayer(config, name='transformer')
        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj')

+    def get_output_embeddings(self):
+        return self.pred_layer.input_embeddings

    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)
...
@@ -112,8 +112,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
    def prune_heads(self, heads):
        raise NotImplementedError

-    @staticmethod
-    def rel_shift(x, klen=-1):
+    def rel_shift(self, x, klen=-1):
        """perform relative shift to form the relative attention score."""
        x_size = shape_list(x)
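The hunk cuts the body off; for orientation, a sketch of the full relative-shift trick as it appears upstream (a reshape/slice re-indexing, no gather), written here as a free function with static shapes for self-containedness:

```python
import tensorflow as tf

def rel_shift(x, klen=-1):
    """Re-index position scores so each query row lines up with its relative positions."""
    x_size = x.shape.as_list()  # [qlen, pos, bsz, n_head]
    x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3]))
    x = x[1:, ...]  # drop the first, now misaligned, row
    x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3]))
    x = x[:, :klen, ...]
    return x
```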
@@ -135,7 +134,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
        # position based attention score
        bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
-        bd = self.rel_shift(bd, klen=ac.shape[1])
+        bd = self.rel_shift(bd, klen=shape_list(ac)[1])

        # segment based attention score
        if seg_mat is None:
@@ -192,7 +191,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
        if g is not None:
            ###### Two-stream attention with relative positional encoding.
            # content based attention score
-            if mems is not None and mems.shape.ndims > 1:
+            if mems is not None and len(shape_list(mems)) > 1:
                cat = tf.concat([mems, h], axis=0)
            else:
                cat = h
@@ -252,7 +251,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
        else:
            ###### Multi-head attention with relative positional encoding
-            if mems is not None and mems.shape.ndims > 1:
+            if mems is not None and len(shape_list(mems)) > 1:
                cat = tf.concat([mems, h], axis=0)
            else:
                cat = h
@@ -371,6 +370,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
        self.dropout = tf.keras.layers.Dropout(config.dropout)

+    def get_input_embeddings(self):
+        return self.word_embedding

    def build(self, input_shape):
        initializer = get_initializer(self.initializer_range)
        self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
@@ -484,7 +486,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        return pos_emb

    def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-             token_type_ids=None, input_mask=None, head_mask=None, training=False):
+             token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, training=False):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -494,7 +496,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids
            input_mask = inputs[6] if len(inputs) > 6 else input_mask
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
-            assert len(inputs) <= 8, "Too many inputs."
+            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
+            assert len(inputs) <= 9, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
@@ -504,7 +507,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            input_mask = inputs.get('input_mask', input_mask)
            head_mask = inputs.get('head_mask', head_mask)
-            assert len(inputs) <= 8, "Too many inputs."
+            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            assert len(inputs) <= 9, "Too many inputs."
        else:
            input_ids = inputs
@@ -512,14 +516,23 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        # but we want a unified interface in the library with the batch size on the first dimension
        # so we move here the first dimension (batch) to the end

-        input_ids = tf.transpose(input_ids, perm=(1, 0))
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = tf.transpose(input_ids, perm=(1, 0))
+            qlen, bsz = shape_list(input_ids)[:2]
+        elif inputs_embeds is not None:
+            inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
+            qlen, bsz = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

        token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
        input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
        attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
        perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
        target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None

-        qlen, bsz = shape_list(input_ids)[:2]
        mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0
        klen = mlen + qlen
@@ -551,7 +564,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        if data_mask is not None:
            # all mems can be attended to
-            mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
+            mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz],
                                 dtype=dtype_float)
            data_mask = tf.concat([mems_mask, data_mask], axis=1)
        if attn_mask is None:
@@ -570,10 +583,13 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            non_tgt_mask = None

        ##### Word embeddings and prepare h & g hidden states
-        word_emb_k = self.word_embedding(input_ids)
+        if inputs_embeds is not None:
+            word_emb_k = inputs_embeds
+        else:
+            word_emb_k = self.word_embedding(input_ids)
        output_h = self.dropout(word_emb_k, training=training)
        if target_mapping is not None:
-            word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
+            word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1])
        # else:  # We removed the inp_q input which was same as target mapping
        #     inp_q_ext = inp_q[:, :, None]
        #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
@@ -762,6 +778,10 @@ XLNET_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""

@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
@@ -850,6 +870,9 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
        self.transformer = TFXLNetMainLayer(config, name='transformer')
        self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss')

+    def get_output_embeddings(self):
+        return self.lm_loss.input_embeddings

    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_state = transformer_outputs[0]
@@ -915,6 +938,59 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
        return outputs  # return logits, (mems), (hidden states), (attentions)

+@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **mems**: (`optional`, returned when ``config.mem_len > 0``)
+            list of ``tf.Tensor`` (one for each layer)
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import XLNetTokenizer, TFXLNetForTokenClassification
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFXLNetMainLayer(config, name='transformer')
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        transformer_outputs = self.transformer(inputs, **kwargs)
+        output = transformer_outputs[0]
+
+        logits = self.classifier(output)
+
+        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if they are present
+
+        return outputs  # return logits, (mems), (hidden states), (attentions)

# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
#     the hidden-states output to compute `span start logits` and `span end logits`). """,
#     XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
...
@@ -553,6 +553,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", @add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
@@ -657,12 +661,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        logger.info("Head pruning is not implemented for Transformer-XL model")
        pass

-    def init_mems(self, data):
+    def init_mems(self, bsz):
        if self.mem_len > 0:
            mems = []
            param = next(self.parameters())
            for i in range(self.n_layer):
-                empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model,
+                empty = torch.zeros(self.mem_len, bsz, self.config.d_model,
                                    dtype=param.dtype, device=param.device)
                mems.append(empty)
@@ -693,15 +697,22 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        return new_mems

-    def forward(self, input_ids, mems=None, head_mask=None):
+    def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None):
        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
-        input_ids = input_ids.transpose(0, 1).contiguous()
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = input_ids.transpose(0, 1).contiguous()
+            qlen, bsz = input_ids.size()
+        elif inputs_embeds is not None:
+            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
+            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if mems is None:
-            mems = self.init_mems(input_ids)
+            mems = self.init_mems(bsz)
-        qlen, bsz = input_ids.size()

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
@@ -718,7 +729,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        else:
            head_mask = [None] * self.n_layer

-        word_emb = self.word_emb(input_ids)
+        if inputs_embeds is not None:
+            word_emb = inputs_embeds
+        else:
+            word_emb = self.word_emb(input_ids)

        mlen = mems[0].size(0) if mems is not None else 0
        klen = mlen + qlen
@@ -860,14 +874,18 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
    def reset_length(self, tgt_len, ext_len, mem_len):
        self.transformer.reset_length(tgt_len, ext_len, mem_len)

-    def init_mems(self, data):
-        return self.transformer.init_mems(data)
+    def init_mems(self, bsz):
+        return self.transformer.init_mems(bsz)

-    def forward(self, input_ids, mems=None, head_mask=None, labels=None):
-        bsz = input_ids.size(0)
-        tgt_len = input_ids.size(1)
+    def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, labels=None):
+        if input_ids is not None:
+            bsz, tgt_len = input_ids.size(0), input_ids.size(1)
+        elif inputs_embeds is not None:
+            bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

-        transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask)
+        transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask, inputs_embeds=inputs_embeds)

        last_hidden = transformer_outputs[0]
        pred_hid = last_hidden[:, -tgt_len:]
...
@@ -53,7 +53,7 @@ class PreTrainedModel(nn.Module):
    r""" Base class for all models.

        :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
-        as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
+        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.

        Class attributes (overridden by derived classes):
            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
@@ -237,7 +237,7 @@ class PreTrainedModel(nn.Module):
        """
        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"

-        # Only save the model it-self if we are using distributed training
+        # Only save the model itself if we are using distributed training
        model_to_save = self.module if hasattr(self, 'module') else self

        # Save configuration file
@@ -290,6 +290,9 @@ class PreTrainedModel(nn.Module):
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
@@ -314,11 +317,16 @@ class PreTrainedModel(nn.Module):
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
+        if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
+            logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
+                           "https://github.com/google-research/google-research/issues/119 for more information.")
+
        config = kwargs.pop('config', None)
        state_dict = kwargs.pop('state_dict', None)
        cache_dir = kwargs.pop('cache_dir', None)
        from_tf = kwargs.pop('from_tf', False)
        force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
        proxies = kwargs.pop('proxies', None)
        output_loading_info = kwargs.pop('output_loading_info', False)
@@ -328,6 +336,7 @@ class PreTrainedModel(nn.Module):
                pretrained_model_name_or_path, *model_args,
                cache_dir=cache_dir, return_unused_kwargs=True,
                force_download=force_download,
+                resume_download=resume_download,
                proxies=proxies,
                **kwargs
            )
@@ -360,7 +369,8 @@ class PreTrainedModel(nn.Module):
        # redirect to the cache, if necessary
        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
+                                                proxies=proxies, resume_download=resume_download)
        except EnvironmentError:
            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
@@ -416,6 +426,8 @@ class PreTrainedModel(nn.Module):
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
+            if key == 'lm_head.decoder.weight':
+                new_key = 'lm_head.weight'
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
@@ -726,7 +738,7 @@ class SequenceSummary(nn.Module):
    def __init__(self, config):
        super(SequenceSummary, self).__init__()

-        self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
+        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
...
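The one-word change above fixes a real bug: the `hasattr` guard and the attribute it guarded did not match, so `summary_type` could be silently ignored or raise. A toy illustration of the corrected logic (`Config` here is a made-up stand-in):

```python
class Config:
    summary_type = 'mean'  # made-up config that sets only summary_type

config = Config()

# Corrected: the guard and the attribute now match.
summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
assert summary_type == 'mean'

# Before the fix the guard checked 'summary_use_proj', which this config
# lacks, so the value would have silently fallen back to 'last'.
```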
@@ -311,6 +311,10 @@ XLM_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", @add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
@@ -421,14 +425,21 @@ class XLMModel(XLMPreTrainedModel):
        for layer, heads in heads_to_prune.items():
            self.attentions[layer].prune_heads(heads)

-    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
-                lengths=None, cache=None, head_mask=None):  # removed: src_enc=None, src_len=None
+    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, inputs_embeds=None):  # removed: src_enc=None, src_len=None
+        if input_ids is not None:
+            bs, slen = input_ids.size()
+        else:
+            bs, slen = inputs_embeds.size()[:-1]

        if lengths is None:
-            lengths = (input_ids != self.pad_index).sum(dim=1).long()
+            if input_ids is not None:
+                lengths = (input_ids != self.pad_index).sum(dim=1).long()
+            else:
+                lengths = torch.LongTensor([slen]*bs)
        # mask = input_ids != self.pad_index

        # check inputs
-        bs, slen = input_ids.size()
        assert lengths.size(0) == bs
        assert lengths.max().item() <= slen
        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
...@@ -442,10 +453,12 @@ class XLMModel(XLMPreTrainedModel): ...@@ -442,10 +453,12 @@ class XLMModel(XLMPreTrainedModel):
# if self.is_decoder and src_enc is not None: # if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
device = input_ids.device if input_ids is not None else inputs_embeds.device
# position_ids # position_ids
if position_ids is None: if position_ids is None:
position_ids = input_ids.new((slen,)).long() position_ids = torch.arange(slen, dtype=torch.long, device=device)
position_ids = torch.arange(slen, out=position_ids).unsqueeze(0) position_ids = position_ids.unsqueeze(0).expand((bs, slen))
else: else:
assert position_ids.size() == (bs, slen) # (slen, bs) assert position_ids.size() == (bs, slen) # (slen, bs)
# position_ids = position_ids.transpose(0, 1) # position_ids = position_ids.transpose(0, 1)
...@@ -471,7 +484,7 @@ class XLMModel(XLMPreTrainedModel): ...@@ -471,7 +484,7 @@ class XLMModel(XLMPreTrainedModel):
head_mask = [None] * self.n_layers head_mask = [None] * self.n_layers
# do not recompute cached elements # do not recompute cached elements
if cache is not None: if cache is not None and input_ids is not None:
_slen = slen - cache['slen'] _slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:] input_ids = input_ids[:, -_slen:]
position_ids = position_ids[:, -_slen:] position_ids = position_ids[:, -_slen:]
...@@ -481,8 +494,10 @@ class XLMModel(XLMPreTrainedModel): ...@@ -481,8 +494,10 @@ class XLMModel(XLMPreTrainedModel):
attn_mask = attn_mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:]
# embeddings # embeddings
tensor = self.embeddings(input_ids) if inputs_embeds is None:
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor) inputs_embeds = self.embeddings(input_ids)
tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
if langs is not None and self.use_lang_emb: if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs) tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None: if token_type_ids is not None:
...@@ -624,8 +639,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -624,8 +639,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
def get_output_embeddings(self): def get_output_embeddings(self):
return self.pred_layer.proj return self.pred_layer.proj
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -633,7 +648,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -633,7 +648,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
outputs = self.pred_layer(output, labels) outputs = self.pred_layer(output, labels)
...@@ -685,8 +701,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -685,8 +701,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -694,7 +710,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -694,7 +710,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
...@@ -768,8 +785,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -768,8 +785,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -777,7 +794,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -777,7 +794,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = transformer_outputs[0] sequence_output = transformer_outputs[0]
...@@ -863,8 +881,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -863,8 +881,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None,
is_impossible=None, cls_index=None, p_mask=None): is_impossible=None, cls_index=None, p_mask=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -873,7 +891,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -873,7 +891,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
......
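As a usage sketch of the new `inputs_embeds` path in XLM (checkpoint name illustrative; with no padding and default masks the two calls should produce matching hidden states):

import torch
from transformers import XLMModel, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
model.eval()

input_ids = torch.tensor([tokenizer.encode("Hello world")])
inputs_embeds = model.embeddings(input_ids)   # do the embedding lookup manually

with torch.no_grad():
    out_from_ids = model(input_ids)[0]
    out_from_embeds = model(inputs_embeds=inputs_embeds)[0]
assert torch.allclose(out_from_ids, out_from_embeds, atol=1e-5)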
@@ -558,6 +558,10 @@ XLNET_INPUTS_DOCSTRING = r"""
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
 """

 @add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
@@ -579,6 +583,7 @@ class XLNetModel(XLNetPreTrainedModel):
         **attentions**: (`optional`, returned when ``config.output_attentions=True``)
             list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

     Examples::

@@ -712,19 +717,29 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb

-    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None):
+    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None):
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
-        input_ids = input_ids.transpose(0, 1).contiguous()
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = input_ids.transpose(0, 1).contiguous()
+            qlen, bsz = input_ids.shape[0], input_ids.shape[1]
+        elif inputs_embeds is not None:
+            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
+            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
         token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
         input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
         attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
         perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
         target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None

-        qlen, bsz = input_ids.shape[0], input_ids.shape[1]
         mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
         klen = mlen + qlen
@@ -777,7 +792,10 @@ class XLNetModel(XLNetPreTrainedModel):
             non_tgt_mask = None

         ##### Word embeddings and prepare h & g hidden states
-        word_emb_k = self.word_embedding(input_ids)
+        if inputs_embeds is not None:
+            word_emb_k = inputs_embeds
+        else:
+            word_emb_k = self.word_embedding(input_ids)
         output_h = self.dropout(word_emb_k)
         if target_mapping is not None:
             word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
@@ -861,7 +879,11 @@ class XLNetModel(XLNetPreTrainedModel):
             hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
             outputs = outputs + (hidden_states,)
         if self.output_attentions:
-            attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
+            if target_mapping is not None:
+                # when target_mapping is provided, there are 2-tuple of attentions
+                attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions)
+            else:
+                attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
             outputs = outputs + (attentions,)

         return outputs  # outputs, (new_mems), (hidden_states), (attentions)
@@ -896,6 +918,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         **attentions**: (`optional`, returned when ``config.output_attentions=True``)
             list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

     Examples::

@@ -924,8 +947,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_loss

-    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
         transformer_outputs = self.transformer(input_ids,
                                                attention_mask=attention_mask,
                                                mems=mems,
@@ -933,7 +956,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
                                                target_mapping=target_mapping,
                                                token_type_ids=token_type_ids,
                                                input_mask=input_mask,
-                                               head_mask=head_mask)
+                                               head_mask=head_mask,
+                                               inputs_embeds=inputs_embeds)

         logits = self.lm_loss(transformer_outputs[0])
@@ -977,6 +1001,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         **attentions**: (`optional`, returned when ``config.output_attentions=True``)
             list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

     Examples::

@@ -998,8 +1023,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
         transformer_outputs = self.transformer(input_ids,
                                                attention_mask=attention_mask,
                                                mems=mems,
@@ -1007,7 +1032,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                                                target_mapping=target_mapping,
                                                token_type_ids=token_type_ids,
                                                input_mask=input_mask,
-                                               head_mask=head_mask)
+                                               head_mask=head_mask,
+                                               inputs_embeds=inputs_embeds)
         output = transformer_outputs[0]
         output = self.sequence_summary(output)
@@ -1027,6 +1053,106 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
XLNET_START_DOCSTRING,
XLNET_INPUTS_DOCSTRING)
class XLNetForTokenClassification(XLNetPreTrainedModel):
r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to scores.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
Classification scores (before SoftMax).
**mems**: (`optional`, returned when ``config.mem_len > 0``)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
scores = outputs[0]
"""
def __init__(self, config):
super(XLNetForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
self.transformer = XLNetModel(config)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[1:] # Keep mems, hidden states, attentions if there are in it
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
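The masked loss above is the usual way to keep padding positions out of a token-classification loss. The core step in isolation (all shapes invented):

import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.randint(num_labels, (batch_size, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1               # True only at real tokens
active_logits = logits.view(-1, num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)            # padding excluded from the loss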
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of @add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """, the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
...@@ -1049,6 +1175,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1049,6 +1175,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
...@@ -1072,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1072,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``) **attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
Examples:: Examples::
...@@ -1093,9 +1224,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1093,9 +1224,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None, mems=None, perm_mask=None, target_mapping=None,
labels=None, head_mask=None): labels=None, head_mask=None, inputs_embeds=None):
num_choices = input_ids.shape[1] num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_input_ids = input_ids.view(-1, input_ids.size(-1))
...@@ -1106,7 +1237,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1106,7 +1237,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids, transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids,
input_mask=flat_input_mask, attention_mask=flat_attention_mask, input_mask=flat_input_mask, attention_mask=flat_attention_mask,
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
head_mask=head_mask) head_mask=head_mask, inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -1157,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1157,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``) **attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
Examples:: Examples::
...@@ -1178,8 +1310,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1178,8 +1310,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None): start_positions=None, end_positions=None):
outputs = self.transformer(input_ids, outputs = self.transformer(input_ids,
...@@ -1189,7 +1321,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1189,7 +1321,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
target_mapping=target_mapping, target_mapping=target_mapping,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
input_mask=input_mask, input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1270,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1270,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``) **attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
Examples:: Examples::
...@@ -1294,8 +1428,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1294,8 +1428,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,): start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -1304,7 +1438,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1304,7 +1438,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
target_mapping=target_mapping, target_mapping=target_mapping,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
input_mask=input_mask, input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask) start_logits = self.start_logits(hidden_states, p_mask=p_mask)
......
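To observe the 2-tuple attention outputs documented above, one can run XLNet with a `target_mapping`. A sketch with a deliberately tiny, invented config (parameter names follow the config style used elsewhere in this revision):

import torch
from transformers import XLNetConfig, XLNetModel

config = XLNetConfig(vocab_size_or_config_json_file=32, d_model=16,
                     n_layer=2, n_head=2, d_inner=32, output_attentions=True)
model = XLNetModel(config)
model.eval()

input_ids = torch.randint(32, (1, 4))
target_mapping = torch.zeros(1, 1, 4)   # one prediction target...
target_mapping[0, 0, -1] = 1.0          # ...mapped onto the last position

attentions = model(input_ids, target_mapping=target_mapping)[-1]
# each layer now yields one tensor per attention stream (h and g)
assert isinstance(attentions[0], tuple) and len(attentions[0]) == 2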
@@ -23,86 +23,66 @@ from torch.optim.lr_scheduler import LambdaLR

 logger = logging.getLogger(__name__)

-class ConstantLRSchedule(LambdaLR):
-    """ Constant learning rate schedule.
-    """
-    def __init__(self, optimizer, last_epoch=-1):
-        super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
-
-
-class WarmupConstantSchedule(LambdaLR):
-    """ Linear warmup and then constant.
-        Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps.
-        Keeps learning rate schedule equal to 1. after warmup_steps.
-    """
-    def __init__(self, optimizer, warmup_steps, last_epoch=-1):
-        self.warmup_steps = warmup_steps
-        super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
-
-    def lr_lambda(self, step):
-        if step < self.warmup_steps:
-            return float(step) / float(max(1.0, self.warmup_steps))
-        return 1.
-
-
-class WarmupLinearSchedule(LambdaLR):
-    """ Linear warmup and then linear decay.
-        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
-        Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps.
-    """
-    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
-        self.warmup_steps = warmup_steps
-        self.t_total = t_total
-        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
-
-    def lr_lambda(self, step):
-        if step < self.warmup_steps:
-            return float(step) / float(max(1, self.warmup_steps))
-        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
-
-
-class WarmupCosineSchedule(LambdaLR):
-    """ Linear warmup and then cosine decay.
-        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
-        Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
-        If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    """
-    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
-        self.warmup_steps = warmup_steps
-        self.t_total = t_total
-        self.cycles = cycles
-        super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
-
-    def lr_lambda(self, step):
-        if step < self.warmup_steps:
-            return float(step) / float(max(1.0, self.warmup_steps))
-        # progress after warmup
-        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
-        return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
-
-
-class WarmupCosineWithHardRestartsSchedule(LambdaLR):
-    """ Linear warmup and then cosine cycles with hard restarts.
-        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
-        If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
-        learning rate (with hard restarts).
-    """
-    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
-        self.warmup_steps = warmup_steps
-        self.t_total = t_total
-        self.cycles = cycles
-        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
-
-    def lr_lambda(self, step):
-        if step < self.warmup_steps:
-            return float(step) / float(max(1, self.warmup_steps))
-        # progress after warmup
-        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
-        if progress >= 1.0:
-            return 0.0
-        return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0))))
+def get_constant_schedule(optimizer, last_epoch=-1):
+    """ Create a schedule with a constant learning rate.
+    """
+    return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
+
+
+def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
+    """ Create a schedule with a constant learning rate preceded by a warmup
+    period during which the learning rate increases linearly between 0 and 1.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1.0, num_warmup_steps))
+        return 1.
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
+    """ Create a schedule with a learning rate that decreases linearly after
+    linearly increasing during a warmup period.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
+    """ Create a schedule with a learning rate that decreases following the
+    values of the cosine function between 0 and `pi * cycles` after a warmup
+    period during which it increases linearly between 0 and 1.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+        return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1):
+    """ Create a schedule with a learning rate that decreases following the
+    values of the cosine function with several hard restarts, after a warmup
+    period during which it increases linearly between 0 and 1.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+        if progress >= 1.:
+            return 0.
+        return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.))))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)


 class AdamW(Optimizer):
     """ Implements Adam algorithm with weight decay fix.
...
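A usage sketch for the refactored function-style schedules (the toy model and step counts are invented):

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)   # stand-in for a real transformer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000)

for step in range(1000):
    loss = model(torch.randn(4, 10)).pow(2).mean()   # dummy loss
    loss.backward()
    optimizer.step()
    scheduler.step()   # warm up for 100 steps, then decay linearly to 0
    optimizer.zero_grad()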
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Applys a warmup schedule on a given learning rate decay schedule."""
def __init__(
self,
initial_learning_rate,
decay_schedule_fn,
warmup_steps,
power=1.0,
name=None):
super(WarmUp, self).__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.power = power
self.decay_schedule_fn = decay_schedule_fn
self.name = name
def __call__(self, step):
with tf.name_scope(self.name or 'WarmUp') as name:
# Implements polynomial warmup. i.e., if global_step < warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
global_step_float = tf.cast(step, tf.float32)
warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
warmup_percent_done = global_step_float / warmup_steps_float
warmup_learning_rate = (
self.initial_learning_rate *
tf.math.pow(warmup_percent_done, self.power))
return tf.cond(global_step_float < warmup_steps_float,
lambda: warmup_learning_rate,
lambda: self.decay_schedule_fn(step),
name=name)
def get_config(self):
return {
'initial_learning_rate': self.initial_learning_rate,
'decay_schedule_fn': self.decay_schedule_fn,
'warmup_steps': self.warmup_steps,
'power': self.power,
'name': self.name
}
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
"""Creates an optimizer with learning rate schedule."""
# Implements linear decay of the learning rate.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr,
decay_steps=num_train_steps,
end_learning_rate=0.0)
if num_warmup_steps:
learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
decay_schedule_fn=learning_rate_fn,
warmup_steps=num_warmup_steps)
optimizer = AdamWeightDecay(
learning_rate=learning_rate_fn,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=['layer_norm', 'bias'])
return optimizer
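A sketch of driving `create_optimizer` by hand. Note that this `AdamWeightDecay.apply_gradients` takes a required `clip_norm` argument, so the sketch uses an explicit gradient tape rather than `model.fit`; all data below is invented:

import tensorflow as tf

optimizer = create_optimizer(init_lr=3e-5, num_train_steps=1000, num_warmup_steps=100)

layer = tf.keras.layers.Dense(2)
x = tf.random.normal((8, 4))
y = tf.zeros((8,), dtype=tf.int32)
with tf.GradientTape() as tape:
    logits = layer(x)
    loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(y, logits, from_logits=True))
grads = tape.gradient(loss, layer.trainable_variables)
# this optimizer also clips by global norm inside apply_gradients
optimizer.apply_gradients(zip(grads, layer.trainable_variables), clip_norm=1.0)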
class AdamWeightDecay(tf.keras.optimizers.Adam):
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
Just adding the square of the weights to the loss function is *not* the
correct way of using L2 regularization/weight decay with Adam, since that will
interact with the m and v parameters in strange ways.
Instead we want to decay the weights in a manner that doesn't interact with
the m/v parameters. This is equivalent to adding the square of the weights to
the loss with plain (non-momentum) SGD.
"""
def __init__(self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
weight_decay_rate=0.0,
include_in_weight_decay=None,
exclude_from_weight_decay=None,
name='AdamWeightDecay',
**kwargs):
super(AdamWeightDecay, self).__init__(
learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
@classmethod
def from_config(cls, config):
"""Creates an optimizer from its config with WarmUp custom object."""
custom_objects = {'WarmUp': WarmUp}
return super(AdamWeightDecay, cls).from_config(
config, custom_objects=custom_objects)
def _prepare_local(self, var_device, var_dtype, apply_state):
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
apply_state)
apply_state['weight_decay_rate'] = tf.constant(
self.weight_decay_rate, name='adam_weight_decay_rate')
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var *
apply_state['weight_decay_rate'],
use_locking=self._use_locking)
return tf.no_op()
def apply_gradients(self, grads_and_vars, clip_norm, name=None):
grads, tvars = list(zip(*grads_and_vars))
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
def _get_lr(self, var_device, var_dtype, apply_state):
"""Retrieves the learning rate with the given state."""
if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients['lr_t'], dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_dense(
grad, var, **kwargs)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_sparse(
grad, var, indices, **kwargs)
def get_config(self):
config = super(AdamWeightDecay, self).get_config()
config.update({
'weight_decay_rate': self.weight_decay_rate,
})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object):
"""Distribution strategies-aware gradient accumulation utility."""
def __init__(self):
"""Initializes the accumulator."""
self._gradients = []
self._accum_steps = tf.Variable(
initial_value=0,
dtype=tf.int64,
trainable=False,
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
@property
def step(self):
"""Number of accumulated steps."""
return self._accum_steps.value()
@property
def gradients(self):
"""The accumulated gradients."""
return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients())
def __call__(self, gradients):
"""Accumulates :obj:`gradients`."""
if not self._gradients:
self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients])
if len(gradients) != len(self._gradients):
raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
if accum_gradient is not None:
accum_gradient.assign_add(gradient)
self._accum_steps.assign_add(1)
def reset(self):
"""Resets the accumulated gradients."""
if self._gradients:
self._accum_steps.assign(0)
for gradient in self._get_replica_gradients():
if gradient is not None:
gradient.assign(tf.zeros_like(gradient))
def _get_replica_gradients(self):
if tf.distribute.has_strategy():
# In a replica context, we want to accumulate gradients on each replica
# without synchronization, so we directly assign the value of the
# current replica.
replica_context = tf.distribute.get_replica_context()
if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
return self._gradients
return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients)
else:
return self._gradients
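A minimal single-replica sketch of the accumulator above (variable shapes and the number of micro-batches are invented):

import tensorflow as tf

accumulator = GradientAccumulator()
var = tf.Variable(tf.zeros((3,)))

for micro_step in range(4):   # pretend these are 4 micro-batches
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(var * float(micro_step + 1))
    grads = tape.gradient(loss, [var])
    accumulator(grads)        # buffers are created lazily on the first call

assert int(accumulator.step) == 4
mean_grads = [g / tf.cast(accumulator.step, g.dtype) for g in accumulator.gradients]
accumulator.reset()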
# content of conftest.py
import pytest
def pytest_addoption(parser):
parser.addoption(
"--runslow", action="store_true", default=False, help="run slow tests"
)
def pytest_collection_modifyitems(config, items):
if config.getoption("--runslow"):
# --runslow given in cli: do not skip slow tests
return
skip_slow = pytest.mark.skip(reason="need --runslow option to run")
for item in items:
if "slow" in item.keywords:
item.add_marker(skip_slow)
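With this conftest in place, a test opts into the slow bucket via the marker; for example (test body invented):

import pytest

@pytest.mark.slow
def test_download_all_pretrained_checkpoints():
    ...   # expensive; skipped by plain `pytest`, runs under `pytest --runslow`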
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import os
import six
import time
import unittest
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
USER = "__DUMMY_TRANSFORMERS_USER__"
PASS = "__DUMMY_TRANSFORMERS_PASS__"
FILE_KEY = "Test-{}.txt".format(int(time.time()))
FILE_PATH = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
)
class HfApiCommonTest(unittest.TestCase):
_api = HfApi(endpoint="https://moon-staging.huggingface.co")
class HfApiLoginTest(HfApiCommonTest):
def test_login_invalid(self):
with self.assertRaises(HTTPError):
self._api.login(username=USER, password="fake")
def test_login_valid(self):
token = self._api.login(username=USER, password=PASS)
self.assertIsInstance(token, six.string_types)
class HfApiEndpointsTest(HfApiCommonTest):
@classmethod
def setUpClass(cls):
"""
Share this valid token in all tests below.
"""
cls._token = cls._api.login(username=USER, password=PASS)
def test_whoami(self):
user = self._api.whoami(token=self._token)
self.assertEqual(user, USER)
def test_presign(self):
urls = self._api.presign(token=self._token, filename=FILE_KEY)
self.assertIsInstance(urls, PresignedUrl)
self.assertEqual(urls.type, "text/plain")
def test_presign_and_upload(self):
access_url = self._api.presign_and_upload(
token=self._token, filename=FILE_KEY, filepath=FILE_PATH
)
self.assertIsInstance(access_url, six.string_types)
def test_list_objs(self):
objs = self._api.list_objs(token=self._token)
self.assertIsInstance(objs, list)
if len(objs) > 0:
o = objs[-1]
self.assertIsInstance(o, S3Obj)
class HfFolderTest(unittest.TestCase):
def test_token_workflow(self):
"""
Test the whole token save/get/delete workflow,
with the desired behavior with respect to non-existent tokens.
"""
token = "token-{}".format(int(time.time()))
HfFolder.save_token(token)
self.assertEqual(
HfFolder.get_token(),
token
)
HfFolder.delete_token()
HfFolder.delete_token()
# ^^ not an error, we test that the
# second call does not fail.
self.assertEqual(
HfFolder.get_token(),
None
)
if __name__ == "__main__":
unittest.main()
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
AlbertForSequenceClassification, AlbertForQuestionAnswering,
)
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
class AlbertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
class AlbertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
embedding_size=16,
hidden_size=36,
num_hidden_layers=6,
num_hidden_groups=6,
num_attention_heads=6,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
self.num_hidden_groups = num_hidden_groups
def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                num_hidden_groups=self.num_hidden_groups)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = AlbertModel(config=config)
            model.to(torch_device)
            model.eval()
            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)
            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = AlbertForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.check_loss_output(result)

        def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = AlbertForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                   start_positions=sequence_labels, end_positions=sequence_labels)
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(
                list(result["start_logits"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["end_logits"].size()),
                [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = AlbertForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = AlbertModelTest.AlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_albert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
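Note: the @slow test above downloads a pretrained checkpoint and is skipped unless slow tests are enabled, so the rest of the suite can be run cheaply on its own. A sketch of driving just these tests from Python; the dotted module path is an assumption about where this file sits in the repository, not something this commit states:

# Run the fast Albert tests programmatically (module path is assumed).
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName(
    "transformers.tests.modeling_albert_test")
unittest.TextTestRunner(verbosity=2).run(suite)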
@@ -18,11 +18,12 @@ from __future__ import print_function
 import unittest
 import shutil
-import pytest
 import logging
 
 from transformers import is_torch_available
+from .utils import require_torch, slow
 
 if is_torch_available():
     from transformers import (AutoConfig, BertConfig,
                               AutoModel, BertModel,
@@ -33,12 +34,11 @@ if is_torch_available():
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class AutoModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -53,7 +53,7 @@ class AutoModelTest(unittest.TestCase):
         for value in loading_info.values():
             self.assertEqual(len(value), 0)
 
-    @pytest.mark.slow
+    @slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -66,7 +66,7 @@ class AutoModelTest(unittest.TestCase):
         self.assertIsNotNone(model)
         self.assertIsInstance(model, BertForMaskedLM)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -79,7 +79,7 @@ class AutoModelTest(unittest.TestCase):
         self.assertIsNotNone(model)
         self.assertIsInstance(model, BertForSequenceClassification)
 
-    @pytest.mark.slow
+    @slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
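Note: the `require_torch` and `slow` decorators (and `torch_device` used below) are imported from a new `tests/utils.py` that is not shown in this diff. A minimal sketch of what such helpers plausibly look like; the `RUN_SLOW` environment-variable name is an assumption, not confirmed by this commit:

# Sketch of the assumed tests/utils.py helpers (names from the imports above).
import os
import unittest

from transformers import is_torch_available


def require_torch(test_case):
    # Skips a test function or a whole TestCase when PyTorch is not installed.
    if not is_torch_available():
        return unittest.skip("test requires PyTorch")(test_case)
    return test_case


def slow(test_case):
    # Skips a test unless slow tests are explicitly opted into (assumed env var).
    if not os.environ.get("RUN_SLOW"):
        return unittest.skip("test is slow")(test_case)
    return test_case


torch_device = None
if is_torch_available():
    import torch
    # Run the suite on GPU when one is visible, otherwise fall back to CPU.
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"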
@@ -18,12 +18,12 @@ from __future__ import print_function
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
@@ -31,10 +31,9 @@ if is_torch_available():
                               BertForQuestionAnswering, BertForSequenceClassification,
                               BertForTokenClassification, BertForMultipleChoice)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class BertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
@@ -141,6 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -157,6 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertModel(config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
@@ -173,6 +174,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -186,6 +188,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
@@ -200,6 +203,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.to(torch_device)
             model.eval()
             loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
             result = {
@@ -213,6 +217,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                                     masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
@@ -231,6 +236,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -250,6 +256,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -264,6 +271,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
@@ -278,6 +286,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
             model = BertForMultipleChoice(config=config)
+            model.to(torch_device)
             model.eval()
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
@@ -349,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
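The change repeated through every `create_and_check_*` method above moves the model (and, via `torch_device`-aware tensor constructors, the inputs) onto a shared test device so the suite can run on GPU when one is available. A condensed sketch of that pattern; the function and argument names here are placeholders, not identifiers from the diff:

# Device-agnostic test pattern, condensed from the changes above.
import torch

torch_device = "cuda" if torch.cuda.is_available() else "cpu"


def run_on_test_device(model_class, config, input_ids):
    model = model_class(config)
    model.to(torch_device)                   # parameters follow the test device
    model.eval()                             # dropout off, deterministic outputs
    input_ids = input_ids.to(torch_device)   # inputs must live on the model's device
    with torch.no_grad():
        return model(input_ids)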
@@ -27,19 +27,18 @@ import uuid
 import unittest
 import logging
-import pytest
 
 from transformers import is_torch_available
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     import torch
     import numpy as np
 
-    from transformers import (PretrainedConfig, PreTrainedModel,
+    from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
                               BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -65,6 +64,7 @@ def _config_zero_init(config):
 class CommonTestCases:
 
+    @require_torch
     class CommonModelTester(unittest.TestCase):
 
         model_tester = None
@@ -80,6 +80,7 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
@@ -89,18 +90,17 @@ class CommonTestCases:
             with TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model = model_class.from_pretrained(tmpdirname)
+                model.to(torch_device)
                 with torch.no_grad():
                     after_outputs = model(**inputs_dict)
 
-                # # Make sure we don't have nans
-                out_1 = after_outputs[0].numpy()
-                out_1[np.isnan(out_1)] = 0
-                out_1 = out_1 - out_2
-                amax = np.amax(out_1)
-                amin = np.amin(out_1)
-                self.assertLessEqual(max(amax, -amin), 1e-5)
+                # Make sure we don't have nans
+                out_1 = after_outputs[0].cpu().numpy()
+                out_2 = outputs[0].cpu().numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
 
         def test_initialization(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -118,6 +118,7 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 first, second = model(**inputs_dict)[0], model(**inputs_dict)[0]
                 self.assertEqual(first.ne(second).sum().item(), 0)
@@ -134,6 +135,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
@@ -164,6 +166,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
@@ -203,6 +206,7 @@ class CommonTestCases:
             configs_no_init.torchscript = True
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
                 inputs = inputs_dict['input_ids']  # Let's keep only input_ids
@@ -223,7 +227,10 @@ class CommonTestCases:
             except ValueError:
                 self.fail("Couldn't load module.")
 
+            model.to(torch_device)
             model.eval()
+
+            loaded_model.to(torch_device)
             loaded_model.eval()
 
             model_params = model.parameters()
@@ -249,11 +256,12 @@ class CommonTestCases:
             configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
 
                 # Prepare head_mask
                 # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
+                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
                 head_mask[0, 0] = 0
                 head_mask[-1, :-1] = 0
                 head_mask.requires_grad_(requires_grad=True)
@@ -302,6 +310,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                   -1: [0]}
@@ -330,6 +339,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                   -1: [0]}
@@ -339,6 +349,7 @@ class CommonTestCases:
                 os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
@@ -366,6 +377,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
 
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
 
                 outputs = model(**inputs_dict)
@@ -392,6 +404,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
@@ -408,6 +421,7 @@ class CommonTestCases:
                 os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
                 shutil.rmtree(directory)
                 outputs = model(**inputs_dict)
@@ -438,6 +452,7 @@ class CommonTestCases:
                 config.output_hidden_states = True
                 config.output_attentions = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 hidden_states = outputs[-1]
@@ -488,9 +503,15 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
-                model.get_input_embeddings()
+                self.assertIsInstance(
+                    model.get_input_embeddings(),
+                    (torch.nn.Embedding, AdaptiveEmbedding)
+                )
                 model.set_input_embeddings(torch.nn.Embedding(10, 10))
-                model.get_output_embeddings()
+                x = model.get_output_embeddings()
+                self.assertTrue(
+                    x is None or isinstance(x, torch.nn.Linear)
+                )
 
         def test_tie_model_weights(self):
             if not self.test_torchscript:
@@ -545,6 +566,20 @@ class CommonTestCases:
             # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
             # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 
+        def test_inputs_embeds(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            input_ids = inputs_dict["input_ids"]
+            del inputs_dict["input_ids"]
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                model.to(torch_device)
+                model.eval()
+
+                wte = model.get_input_embeddings()
+                inputs_dict["inputs_embeds"] = wte(input_ids)
+                outputs = model(**inputs_dict)
+
     class GPTModelTester(CommonModelTester):
@@ -629,6 +664,7 @@ class CommonTestCases:
         def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
                                         mc_labels, lm_labels, mc_token_ids):
             model = self.base_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, position_ids, token_type_ids)
@@ -644,6 +680,7 @@ class CommonTestCases:
         def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
                                      mc_labels, lm_labels, mc_token_ids):
             model = self.lm_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
             loss, lm_logits = outputs[:2]
@@ -660,6 +697,7 @@ class CommonTestCases:
                                      mc_labels, lm_labels, mc_token_ids):
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(input_ids)
                 presents = outputs[-1]
@@ -672,6 +710,7 @@ class CommonTestCases:
         def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
                                           mc_labels, lm_labels, mc_token_ids):
             model = self.double_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                             token_type_ids=token_type_ids, position_ids=position_ids)
@@ -717,7 +756,7 @@ class CommonTestCases:
             config_and_inputs = self.prepare_config_and_inputs()
             self.create_and_check_presents(*config_and_inputs)
 
-        @pytest.mark.slow
+        @slow
         def run_slow_tests(self):
             self.create_and_check_model_from_pretrained()
@@ -771,7 +810,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.randint(0, vocab_size - 1))
 
-    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
 
 
 def floats_tensor(shape, scale=1.0, rng=None, name=None):
@@ -787,11 +826,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
     for _ in range(total_dims):
        values.append(rng.random() * scale)
 
-    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
 
 
+@require_torch
 class ModelUtilsTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
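The rewritten save/load check above compares the before- and after-reload outputs on CPU, drops NaNs instead of zeroing them, and bounds the maximum absolute difference. Distilled into a standalone helper (the function name is ours, not from the commit; the 1e-5 tolerance is the one in the diff):

# Standalone version of the numerical-equivalence check used in test_save_load.
import numpy as np
import torch


def assert_outputs_close(t1, t2, tol=1e-5):
    out_1 = t1.cpu().numpy()   # .cpu() first: CUDA tensors cannot be converted to numpy directly
    out_2 = t2.cpu().numpy()
    out_1 = out_1[~np.isnan(out_1)]   # drop NaNs rather than zeroing them
    out_2 = out_2[~np.isnan(out_2)]
    max_diff = np.amax(np.abs(out_1 - out_2))
    assert max_diff <= tol, "max diff %s exceeds %s" % (max_diff, tol)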