Merge branch 'master' into saving-and-resuming

eeb70cdd · Thomas Wolf · GitHub · 6aa91946 · ed9b8481 · eeb70cdd
Unverified Commit eeb70cdd authored Dec 21, 2019 by Thomas Wolf Committed by GitHub Dec 21, 2019
20 changed files
--- a/transformers/modeling_tf_xlnet.py
+++ b/transformers/modeling_tf_xlnet.py
@@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        self.use_bfloat16 = config.use_bfloat16
        self.initializer_range = config.initializer_range

-        self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
+        self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
        self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
        self.dropout = tf.keras.layers.Dropout(config.dropout)

@@ -552,7 +552,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " \
            "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one."
        if input_mask is None and attention_mask is not None:
-            input_mask = 1.0 - attention_mask
+            input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float)
        if input_mask is not None and perm_mask is not None:
            data_mask = input_mask[None] + perm_mask
        elif input_mask is not None and perm_mask is None:
@@ -811,7 +811,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel):

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = TFXLNetModel.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

@@ -855,7 +855,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
        model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')

        # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>"))[None, :]  # We will predict the masked token
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
        perm_mask = tf.zeros((1, input_ids.shape[1], input_ids.shape[1]))
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = tf.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
@@ -911,7 +911,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        logits = outputs[0]

@@ -1022,7 +1022,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        start_scores, end_scores = outputs[:2]

@@ -1086,7 +1086,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):

 #         tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
 #         model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
-#         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+#         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
 #         start_positions = tf.constant([1])
 #         end_positions = tf.constant([3])
 #         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)

--- a/transformers/modeling_transfo_xl.py
+++ b/transformers/modeling_transfo_xl.py
@@ -36,7 +36,7 @@ from torch.nn.parameter import Parameter

 from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary
 from .configuration_transfo_xl import TransfoXLConfig
-from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
+from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits, LogUniformSampler
 from .file_utils import add_start_docstrings

 logger = logging.getLogger(__name__)
@@ -582,7 +582,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):

        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states, mems = outputs[:2]

@@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

-        self.n_token = config.n_token
+        self.n_token = config.vocab_size

        self.d_embed = config.d_embed
        self.d_model = config.d_model
        self.n_head = config.n_head
        self.d_head = config.d_head

-        self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
+        self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
                                          div_val=config.div_val)

        self.drop = nn.Dropout(config.dropout)
@@ -825,7 +825,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):

        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        prediction_scores, mems = outputs[:2]

@@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
        self.sample_softmax = config.sample_softmax
        # use sampled softmax
        if config.sample_softmax > 0:
-            self.out_layer = nn.Linear(config.d_model, config.n_token)
-            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
+            self.out_layer = nn.Linear(config.d_model, config.vocab_size)
+            self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
        # use adaptive softmax (including standard softmax)
        else:
-            self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
+            self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
                                                    config.cutoffs, div_val=config.div_val)
        self.init_weights()

@@ -908,3 +908,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
                outputs = [softmax_output, None] + outputs

        return outputs  # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions)
+
+    def get_output_embeddings(self):
+        """ Return `out_layer` when sampled softmax is enabled, else the last `crit.out_layers` entry.
+            Double-check the result if you are using adaptive softmax.
+        """
+        if self.sample_softmax <= 0:
+            return self.crit.out_layers[-1]
+        return self.out_layer
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
 # coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -31,11 +31,11 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F

 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
+                         cached_path, hf_bucket_url, is_remote_url)

 logger = logging.getLogger(__name__)

-
 try:
    from torch.nn import Identity
 except ImportError:
@@ -71,6 +71,15 @@ class PreTrainedModel(nn.Module):
    load_tf_weights = lambda model, config, path: None
    base_model_prefix = ""

+    @property
+    def dummy_inputs(self):
+        """ Dummy inputs to do a forward pass in the network.
+
+        Returns:
+            dict mapping 'input_ids' to a torch.Tensor built from DUMMY_INPUTS
+        """
+        return dict(input_ids=torch.tensor(DUMMY_INPUTS))
+
    def __init__(self, config, *inputs, **kwargs):
        super(PreTrainedModel, self).__init__()
        if not isinstance(config, PretrainedConfig):
@@ -160,8 +169,7 @@ class PreTrainedModel(nn.Module):
        base_model.vocab_size = new_num_tokens

        # Tie weights again if needed
-        if hasattr(self, 'tie_weights'):
-            self.tie_weights()
+        self.tie_weights()

        return model_embeds

@@ -265,6 +273,7 @@ class PreTrainedModel(nn.Module):
            pretrained_model_name_or_path: either:

                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
@@ -272,7 +281,9 @@ class PreTrainedModel(nn.Module):
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method

-            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            config: (`optional`) one of:
+                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
+                    - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:

                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
@@ -318,11 +329,6 @@ class PreTrainedModel(nn.Module):
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
-        if pretrained_model_name_or_path is not None and (
-                "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path):
-            logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
-                           "https://github.com/google-research/google-research/issues/119 for more information.")
-
        config = kwargs.pop('config', None)
        state_dict = kwargs.pop('state_dict', None)
        cache_dir = kwargs.pop('cache_dir', None)
@@ -332,10 +338,11 @@ class PreTrainedModel(nn.Module):
        proxies = kwargs.pop('proxies', None)
        output_loading_info = kwargs.pop('output_loading_info', False)

-        # Load config
-        if config is None:
+        # Load config if we don't provide a configuration
+        if not isinstance(config, PretrainedConfig):
+            config_path = config if config is not None else pretrained_model_name_or_path
            config, model_kwargs = cls.config_class.from_pretrained(
-                pretrained_model_name_or_path, *model_args,
+                config_path, *model_args,
                cache_dir=cache_dir, return_unused_kwargs=True,
                force_download=force_download,
                resume_download=resume_download,
@@ -363,11 +370,16 @@ class PreTrainedModel(nn.Module):
                    raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
                        [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
                        pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
-            else:
-                assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
+            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
+                assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
+                    pretrained_model_name_or_path + ".index")
                archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
+                if from_tf:
+                    raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")

            # redirect to the cache, if necessary
            try:
@@ -399,7 +411,11 @@ class PreTrainedModel(nn.Module):
        model = cls(config, *model_args, **model_kwargs)

        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
+            try:
+                state_dict = torch.load(resolved_archive_file, map_location='cpu')
+            except:
+                raise OSError("Unable to load weights from pytorch checkpoint file. "
+                              "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ")

        missing_keys = []
        unexpected_keys = []
@@ -428,8 +444,6 @@ class PreTrainedModel(nn.Module):
                    new_key = key.replace('gamma', 'weight')
                if 'beta' in key:
                    new_key = key.replace('beta', 'bias')
-                if key == 'lm_head.decoder.weight':
-                    new_key = 'lm_head.weight'
                if new_key:
                    old_keys.append(key)
                    new_keys.append(new_key)
@@ -471,8 +485,7 @@ class PreTrainedModel(nn.Module):
                raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                model.__class__.__name__, "\n\t".join(error_msgs)))

-        if hasattr(model, 'tie_weights'):
-            model.tie_weights()  # make sure word embedding weights are still tied
+        model.tie_weights()  # make sure word embedding weights are still tied if needed

        # Set model in evaluation mode to desactivate DropOut modules by default
        model.eval()
@@ -483,6 +496,403 @@ class PreTrainedModel(nn.Module):

        return model

+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        """ Build the keyword arguments passed to the model at each generation step.
+            This default implementation forwards only `input_ids` and ignores the extra `kwargs`.
+        """
+        return dict(input_ids=input_ids)
+
+    @torch.no_grad()
+    def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None,
+                 temperature=None, top_k=None, top_p=None, repetition_penalty=None,
+                 bos_token_id=None, pad_token_id=None, eos_token_ids=None,
+                 length_penalty=None, num_return_sequences=None):
+        """ Sequence generator for models with a LM head.
+
+        The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling
+        and beam-search.
+
+        Adapted in part from Facebook's XLM beam search code: https://github.com/facebookresearch/XLM
+
+        Every argument left to `None` falls back to the attribute of the same name on `self.config`.
+
+        Params:
+            **input_ids**: (`optional`) `torch.LongTensor` of shape (batch_size, sequence_length)
+                The sequence used as a prompt for the generation. If `None` the method initializes
+                it as a `torch.LongTensor` of shape (1, 1) filled with `bos_token_id`.
+            **max_length**: (`optional`) int
+                The max length of the sequence to be generated.  Between 1 and infinity. Default to 20.
+            **do_sample**: (`optional`) bool
+                If set to `False` we use greedy decoding; otherwise sampling. Default to greedy sampling.
+            **num_beams**: (`optional`) int
+                Number of beams for beam search. 1 means no beam search. Default to 1.
+            **temperature**: (`optional`) float
+                The value used to modulate the next token probabilities.
+            **top_k**: (`optional`) int
+                The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
+            **top_p**: (`optional`) float
+                The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1.
+            **repetition_penalty**: (`optional`) float
+                The parameter for repetition penalty. Between 1.0 and + infinity. 1.0 means no penalty. Default to 1.
+            **bos_token_id**: (`optional`) int
+                Beginning of sentence token if no prompt is provided. Default to 0.
+            **pad_token_id**: (`optional`) int
+                Padding token id forwarded to the decoding routines. Must be >= 0.
+            **eos_token_ids**: (`optional`) int or list of int
+                End of sequence token or list of tokens to stop the generation. Default to 0.
+            **length_penalty**: (`optional`) float
+                Exponential penalty to the length. Must be strictly positive. Default to 1.
+            **num_return_sequences**: (`optional`) int
+                The number of independently computed returned sequences for each element in the batch. Default to 1.
+
+        Returns:
+            The tensor produced by the decoding routine. When `num_return_sequences != 1` it is
+            reshaped to (batch_size, num_return_sequences, -1).
+
+        Raises:
+            AttributeError: if `self.get_output_embeddings()` returns `None` (no LM head).
+            AssertionError: if one of the arguments fails the sanity checks below.
+        """
+
+        # We cannot generate if the model does not have a LM head
+        if self.get_output_embeddings() is None:
+            raise AttributeError("You tried to generate sequences with a model that does not have a LM Head. "
+                                 "Please use another model class (e.g. `OpenAIGPTLMHeadModel`)")
+
+        max_length = max_length if max_length is not None else self.config.max_length
+        do_sample = do_sample if do_sample is not None else self.config.do_sample
+        num_beams = num_beams if num_beams is not None else self.config.num_beams
+        temperature = temperature if temperature is not None else self.config.temperature
+        top_k = top_k if top_k is not None else self.config.top_k
+        top_p = top_p if top_p is not None else self.config.top_p
+        repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
+        bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
+        pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
+        eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids
+        length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
+        num_return_sequences = num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]  # overriden by the input batch_size
+        else:
+            batch_size = 1
+        if isinstance(eos_token_ids, int):
+            eos_token_ids = [eos_token_ids]
+
+        assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer."
+        assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
+        assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer."
+        # assert temperature >= 0, "`temperature` should be positive."
+        assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
+        assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
+        assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
+        assert isinstance(bos_token_id, int) and bos_token_id >= 0, "`bos_token_id` should be a positive integer."
+        assert isinstance(pad_token_id, int) and pad_token_id >= 0, "`pad_token_id` should be a positive integer."
+        # `all(...)` is required here: a bare generator expression is always truthy and would check nothing
+        assert isinstance(eos_token_ids, (list, tuple)) and all(e >= 0 for e in eos_token_ids), \
+                   "`eos_token_ids` should be a positive integer or a list/tuple of positive integers."
+        assert length_penalty > 0, "`length_penalty` should be strictely positive."
+        assert isinstance(num_return_sequences, int) and num_return_sequences > 0, "`num_return_sequences` should be a strictely positive integer."
+
+        if input_ids is None:
+            # No prompt: start every sequence from the beginning-of-sentence token, on the model's device
+            input_ids = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device)
+        else:
+            assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
+
+        # current position and vocab size
+        cur_len = input_ids.shape[1]
+        vocab_size = self.config.vocab_size
+
+        if num_return_sequences != 1:
+            # Expand input to num return sequences
+            input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len)
+            input_ids = input_ids.contiguous().view(batch_size * num_return_sequences, cur_len)   # (batch_size * num_return_sequences, cur_len)
+            effective_batch_size = batch_size * num_return_sequences
+        else:
+            effective_batch_size = batch_size
+
+        if num_beams > 1:
+            output = self._generate_beam_search(input_ids, cur_len, max_length, do_sample,
+                                                temperature, top_k, top_p, repetition_penalty,
+                                                pad_token_id, eos_token_ids, effective_batch_size,
+                                                length_penalty, num_beams, vocab_size)
+        else:
+            output = self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample,
+                                             temperature, top_k, top_p, repetition_penalty,
+                                             pad_token_id, eos_token_ids, effective_batch_size)
+
+        if num_return_sequences != 1:
+            # Group the returned sequences back per original batch element
+            output = output.view(batch_size, num_return_sequences, -1)
+        return output
+
+    def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample,
+                                 temperature, top_k, top_p, repetition_penalty,
+                                 pad_token_id, eos_token_ids, batch_size):
+        """ Generate sequences for each example without beam search (num_beams == 1).
+            All returned sequences are generated independently.
+
+        One token is appended to every row of `input_ids` per step until `cur_len` reaches
+        `max_length` or every row has produced one of `eos_token_ids`. Rows that already finished
+        receive `pad_token_id`. If the length budget is exhausted, the last token of each
+        still-unfinished row is overwritten with `eos_token_ids[0]`.
+
+        Returns:
+            `input_ids` extended with the generated tokens.
+        """
+        # 1 for sentences still being generated, 0 once an EOS token has been produced
+        unfinished_sents = input_ids.new(batch_size).fill_(1)
+
+        # TODO: add cached compute states
+        pasts = None
+
+        while cur_len < max_length:
+            model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts)
+            outputs = self(**model_inputs)
+            next_token_logits = outputs[0][:, -1, :]
+
+            # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
+            if repetition_penalty != 1.0:
+                for i in range(batch_size):
+                    for previous_token in set(input_ids[i].tolist()):
+                        # Dividing a negative logit by the penalty would raise it and make the repeated
+                        # token more likely, so negative logits are multiplied instead
+                        if next_token_logits[i, previous_token] < 0:
+                            next_token_logits[i, previous_token] *= repetition_penalty
+                        else:
+                            next_token_logits[i, previous_token] /= repetition_penalty
+
+            if do_sample:
+                # Temperature (higher temperature => more likely to sample low probability tokens)
+                if temperature > 0 and temperature != 1.0:
+                    next_token_logits = next_token_logits / temperature
+                # Top-p/top-k filtering
+                next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
+                # Sample
+                next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1).squeeze(1)
+            else:
+                # Greedy decoding
+                next_token = torch.argmax(next_token_logits, dim=-1)
+
+            # update generations and finished sentences (finished sentences only receive padding)
+            tokens_to_add = next_token * unfinished_sents + pad_token_id * (1 - unfinished_sents)
+            input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1)
+            for eos_token_id in eos_token_ids:
+                unfinished_sents.mul_(tokens_to_add.ne(eos_token_id).long())
+            cur_len = cur_len + 1
+
+            # stop when there is a </s> in each sentence, or if we exceed the maximum length
+            if unfinished_sents.max() == 0:
+                break
+
+        # add eos_token_ids to unfinished sentences
+        if cur_len == max_length:
+            input_ids[:, -1].masked_fill_(unfinished_sents.to(dtype=torch.bool), eos_token_ids[0])
+
+        return input_ids
+
+    def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample,
+                              temperature, top_k, top_p, repetition_penalty,
+                              pad_token_id, eos_token_ids, batch_size,
+                              length_penalty, num_beams, vocab_size):
+        """ Generate sequences for each example with beam search.
+        """
+        # Expand input to num beams
+        input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len)
+        input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len)   # (batch_size * num_beams, cur_len)
+
+        # generated hypotheses
+        generated_hyps = [BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)]
+
+        # scores for each sentence in the beam
+        beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
+        beam_scores[:, 1:] = -1e9
+        beam_scores = beam_scores.view(-1)                                      # shape (batch_size * num_beams,)
+
+        # cache compute states
+        pasts = None  # self.prepare_pasts()
+
+        # done sentences
+        done = [False for _ in range(batch_size)]
+
+        while cur_len < max_length:
+            model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts)
+            scores = self(**model_inputs)[0]                                    # (batch_size * num_beams, cur_len, vocab_size)
+            scores = scores[:, -1, :]                                           # (batch_size * num_beams, vocab_size)
+
+            # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
+            if repetition_penalty != 1.0:
+                for i in range(batch_size * num_beams):
+                    for previous_tokens in set(input_ids[i].tolist()):
+                        scores[i, previous_tokens] /= repetition_penalty
+
+            if do_sample:
+                # Temperature (higher temperature => more likely to sample low probability tokens)
+                if temperature > 0 and temperature != 1.0:
+                    scores = scores / temperature
+                # Top-p/top-k filtering
+                scores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2)  # (batch_size * num_beams, vocab_size)
+                # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search)
+                next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2)    # (batch_size * num_beams, 2)
+                # Compute next scores
+                _scores = F.log_softmax(scores, dim=-1)                                     # (batch_size * num_beams, vocab_size)
+                _scores = torch.gather(_scores, -1, next_words)                             # (batch_size * num_beams, 2)
+                next_scores = _scores + beam_scores[:, None].expand_as(_scores)             # (batch_size * num_beams, 2)
+                # Match shape of greedy beam search
+                next_words = next_words.view(batch_size, 2 * num_beams)                     # (batch_size, 2 * num_beams)
+                next_scores = next_scores.view(batch_size, 2 * num_beams)                   # (batch_size, 2 * num_beams)
+            else:
+                # do greedy beam search
+                scores = F.log_softmax(scores, dim=-1)                          # (batch_size * num_beams, vocab_size)
+                assert scores.size() == (batch_size * num_beams, vocab_size)
+                # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
+                _scores = scores + beam_scores[:, None].expand_as(scores)       # (batch_size * num_beams, vocab_size)
+                # re-organize to group the beam together (we are keeping top hypothesis accross beams)
+                _scores = _scores.view(batch_size, num_beams * vocab_size)      # (batch_size, num_beams * vocab_size)
+                next_scores, next_words = torch.topk(_scores, 2*num_beams, dim=1, largest=True, sorted=True)
+
+            assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams)
+
+            # next batch beam content
+            # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch)
+            next_batch_beam = []
+
+            # for each sentence
+            for batch_ex in range(batch_size):
+
+                # if we are done with this sentence
+                done[batch_ex] = done[batch_ex] or generated_hyps[batch_ex].is_done(next_scores[batch_ex].max().item())
+                if done[batch_ex]:
+                    next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams)  # pad the batch
+                    continue
+
+                # next sentence beam content
+                next_sent_beam = []
+
+                # next words for this sentence
+                for idx, score in zip(next_words[batch_ex], next_scores[batch_ex]):
+
+                    # get beam and word IDs
+                    beam_id = idx // vocab_size
+                    word_id = idx % vocab_size
+
+                    # end of sentence, or next word
+                    if word_id.item() in eos_token_ids or cur_len + 1 == max_length:
+                        generated_hyps[batch_ex].add(input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item())
+                    else:
+                        next_sent_beam.append((score, word_id, batch_ex * num_beams + beam_id))
+
+                    # the beam for next step is full
+                    if len(next_sent_beam) == num_beams:
+                        break
+
+                # update next beam content
+                assert len(next_sent_beam) == 0 if cur_len + 1 == max_length else num_beams
+                if len(next_sent_beam) == 0:
+                    next_sent_beam = [(0, pad_token_id, 0)] * num_beams  # pad the batch
+                next_batch_beam.extend(next_sent_beam)
+                assert len(next_batch_beam) == num_beams * (batch_ex + 1)
+
+            # sanity check / prepare next batch
+            assert len(next_batch_beam) == batch_size * num_beams
+            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
+            beam_words = input_ids.new([x[1] for x in next_batch_beam])
+            beam_idx = input_ids.new([x[2] for x in next_batch_beam])
+
+            # re-order batch and internal states
+            input_ids = input_ids[beam_idx, :]
+            input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1)
+            # TODO: Activate cache
+            # for k in cache.keys():
+            #     if k != 'slen':
+            #         cache[k] = (cache[k][0][beam_idx], cache[k][1][beam_idx])
+
+            # update current length
+            cur_len = cur_len + 1
+
+            # stop when we are done with each sentence
+            if all(done):
+                break
+
+        # visualize hypotheses
+        # print([len(x) for x in generated_hyps], cur_len)
+        # globals().update( locals() );
+        # !import code; code.interact(local=vars())
+        # for ii in range(batch_size):
+        #     for ss, ww in sorted(generated_hyps[ii].hyp, key=lambda x: x[0], reverse=True):
+        #         print("%.3f " % ss + " ".join(self.dico[x] for x in ww.tolist()))
+        #     print("")
+
+        # select the best hypotheses
+        tgt_len = input_ids.new(batch_size)
+        best = []
+
+        for i, hypotheses in enumerate(generated_hyps):
+            best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
+            tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
+            best.append(best_hyp)
+
+        # generate target batch
+        decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id)
+        for i, hypo in enumerate(best):
+            decoded[i, :tgt_len[i] - 1] = hypo
+            decoded[i, tgt_len[i] - 1] = eos_token_ids[0]
+
+        return decoded
+
+
+def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf'), min_tokens_to_keep=1):
+    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.
+
+        The two filters are applied one after the other (top-k first, then top-p). Each one overwrites
+        the rejected entries of ``logits`` with ``filter_value``; ``logits`` is modified in place and
+        the same tensor is returned.
+
+        Args:
+            logits: logits distribution shape (batch size, vocabulary size)
+            top_k: if top_k > 0, keep only top k tokens with highest probability (top-k filtering).
+                The value is raised to at least ``min_tokens_to_keep`` and capped at the size of the
+                last dimension of ``logits``.
+            top_p: if top_p < 1.0, keep the top tokens up to and including the first one whose cumulative
+                probability exceeds top_p (nucleus filtering).
+                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
+            filter_value: value written into every filtered-out position (default: -inf).
+            min_tokens_to_keep: make sure we keep at least min_tokens_to_keep per batch example in the output
+        Returns:
+            The ``logits`` tensor, with the filtered-out positions set to ``filter_value``.
+        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """
+    if top_k > 0:
+        top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
+        # Remove all tokens with a probability less than the last token of the top-k
+        # (strict "<": entries tied with the k-th largest value are kept as well)
+        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+        logits[indices_to_remove] = filter_value
+
+    if top_p < 1.0:
+        # Sort each row from most to least likely so the cumulative sum grows along the last dimension
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
+        sorted_indices_to_remove = cumulative_probs > top_p
+        if min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
+            # NOTE(review): the slice below clears min_tokens_to_keep positions, not min_tokens_to_keep-1,
+            # so after the right shift min_tokens_to_keep + 1 tokens are protected - confirm which is intended.
+            sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
+        # Shift the indices to the right to keep also the first token above the threshold
+        # (.clone() because source and destination slices overlap)
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+
+        # scatter sorted tensors to original indexing
+        # (dim=1 is hard-coded here, so this branch assumes a 2-D ``logits`` as described in the docstring)
+        indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
+        logits[indices_to_remove] = filter_value
+    return logits
+
+
+class BeamHypotheses(object):
+    """
+    Bounded list of the best-scoring hypotheses collected so far.
+
+    Each entry of ``self.hyp`` is a ``(score, hyp)`` tuple, where ``score`` is the summed
+    log-probability divided by ``len(hyp) ** length_penalty``. At most ``n_hyp`` entries are kept;
+    when the list overflows, the lowest-scoring entry is dropped.
+    """
+
+    def __init__(self, n_hyp, max_length, length_penalty, early_stopping):
+        """
+        Initialize n-best list of hypotheses.
+
+        Args:
+            n_hyp: maximum number of hypotheses to keep.
+            max_length: maximum sequence length; stored minus one (see the inline comment below).
+            length_penalty: exponent applied to the hypothesis length when normalizing scores.
+            early_stopping: if true, ``is_done`` reports completion as soon as ``n_hyp`` hypotheses are stored.
+        """
+        self.max_length = max_length - 1  # ignoring bos_token
+        self.length_penalty = length_penalty
+        self.early_stopping = early_stopping
+        self.n_hyp = n_hyp
+        self.hyp = []
+        # Lowest score among the stored hypotheses; starts very high so the first ``add`` sets it through ``min``.
+        self.worst_score = 1e9
+
+    def __len__(self):
+        """
+        Number of hypotheses in the list.
+        """
+        return len(self.hyp)
+
+    def add(self, hyp, sum_logprobs):
+        """
+        Add a new hypothesis to the list.
+
+        The hypothesis is stored only if the list is not full yet or its length-normalized score
+        beats the current worst stored score.
+
+        Args:
+            hyp: the hypothesis; only its ``len()`` is used here, the object itself is stored as is.
+            sum_logprobs: summed log-probability of the hypothesis.
+        """
+        score = sum_logprobs / len(hyp) ** self.length_penalty
+        if len(self) < self.n_hyp or score > self.worst_score:
+            self.hyp.append((score, hyp))
+            if len(self) > self.n_hyp:
+                # Over capacity: drop the lowest-scoring entry; the second lowest becomes the new worst.
+                sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.hyp)])
+                del self.hyp[sorted_scores[0][1]]
+                self.worst_score = sorted_scores[1][0]
+            else:
+                self.worst_score = min(score, self.worst_score)
+
+    def is_done(self, best_sum_logprobs):
+        """
+        If there are enough hypotheses and that none of the hypotheses being generated
+        can become better than the worst one in the heap, then we are done with this sentence.
+
+        Args:
+            best_sum_logprobs: best summed log-probability among the hypotheses still being generated.
+        Returns:
+            False while fewer than ``n_hyp`` hypotheses are stored; True once that many are stored
+            and ``early_stopping`` is set; otherwise whether the worst stored score is at least
+            ``best_sum_logprobs`` normalized by ``self.max_length ** length_penalty``.
+        """
+        if len(self) < self.n_hyp:
+            return False
+        elif self.early_stopping:
+            return True
+        else:
+            return self.worst_score >= best_sum_logprobs / self.max_length ** self.length_penalty
+

 class Conv1D(nn.Module):
    def __init__(self, nf, nx):

--- a/transformers/modeling_xlm.py
+++ b/transformers/modeling_xlm.py
@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
    def __init__(self, *inputs, **kwargs):
        super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)

+    @property
+    def dummy_inputs(self):
+        """ Small fixed batch (3 sequences of 5 tokens) of model inputs.
+
+        Returns:
+            dict with ``input_ids``, ``attention_mask`` and ``langs`` tensors of the same shape;
+            ``langs`` is ``None`` unless the config enables language embeddings with more than one language.
+        """
+        # presumably consumed by the base class to run a forward pass on placeholder data - confirm against callers
+        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        else:
+            langs_list = None
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
+
    def _init_weights(self, module):
        """ Initialize the weights. """
        if isinstance(module, nn.Embedding):
@@ -336,7 +346,7 @@ class XLMModel(XLMPreTrainedModel):

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

@@ -624,7 +634,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

@@ -639,6 +649,18 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
    def get_output_embeddings(self):
        return self.pred_layer.proj

+    def prepare_inputs_for_generation(self, input_ids, **kwargs):
+        """ Build the model inputs for one generation step.
+
+        Appends one ``config.mask_token_id`` token to ``input_ids`` and, when ``config.lang_id`` is set,
+        creates a ``langs`` tensor of the same shape filled with that language id.
+
+        Args:
+            input_ids: token ids generated so far (concatenated along dim 1).
+            **kwargs: accepted but not used here.
+        Returns:
+            dict with the extended ``input_ids`` and ``langs`` (``None`` when ``config.lang_id`` is ``None``).
+        """
+        mask_token_id = self.config.mask_token_id
+        lang_id = self.config.lang_id
+
+        # NOTE(review): mask_token has shape (1, 1), so the concatenation along dim=1 below only works
+        # when input_ids has a single row - confirm callers never pass a larger batch (e.g. several beams).
+        mask_token = torch.full((1, 1), mask_token_id, dtype=torch.long, device=input_ids.device)
+        input_ids = torch.cat([input_ids, mask_token], dim=1)
+        if lang_id is not None:
+            langs = torch.full_like(input_ids, lang_id)
+        else:
+            langs = None
+        return {"input_ids": input_ids, "langs": langs}
+
    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
                lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
        transformer_outputs = self.transformer(input_ids,
@@ -646,7 +668,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
                                               langs=langs,
                                               token_type_ids=token_type_ids,
                                               position_ids=position_ids,
-                                               lengths=lengths, 
+                                               lengths=lengths,
                                               cache=cache,
                                               head_mask=head_mask,
                                               inputs_embeds=inputs_embeds)
@@ -686,7 +708,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel):

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
@@ -770,7 +792,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
@@ -866,7 +888,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)

--- a/transformers/modeling_xlm_roberta.py
+++ b/transformers/modeling_xlm_roberta.py
+# coding=utf-8
+# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch XLM-RoBERTa model. """
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
+from .configuration_xlm_roberta import XLMRobertaConfig
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+# Shortcut name of each pretrained XLM-RoBERTa checkpoint -> URL of its PyTorch weights file.
+# Every model class below exposes this dict as ``pretrained_model_archive_map``.
+XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin",
+    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin",
+}
+
+
+XLM_ROBERTA_START_DOCSTRING = r"""    The XLM-RoBERTa model was proposed in
+    `Unsupervised Cross-lingual Representation Learning at Scale`_
+    by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
+    
+    It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.
+
+    This implementation is the same as RoBERTa.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Unsupervised Cross-lingual Representation Learning at Scale`:
+        https://arxiv.org/abs/1911.02116
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 
+            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+XLM_ROBERTA_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, XLM-RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
+
+            (b) For single sequences:
+
+                ``tokens:         <s> the dog is hairy . </s>``
+
+            Fully encoded sequences or sequence pairs can be obtained using the XLMRobertaTokenizer.encode function with 
+            the ``add_special_tokens`` parameter set to ``True``.
+
+            XLM-RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional segment token indices to indicate first and second portions of the inputs.
+            This embedding matrice is not trained (not pretrained during XLM-RoBERTa pretraining), you will have to train it
+            during finetuning.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+"""
+
+@add_start_docstrings("The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+                      XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaModel(RobertaModel):
+    r"""
+    To match pre-training, XLM-RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         <s> is this jack ##son ##ville ? </s> </s> no it is not . </s>``
+
+                ``token_type_ids:   0   0  0    0    0     0       0   0   0     1  1  1  1   1   1``
+
+            (b) For single sequences:
+
+                ``tokens:         <s> the dog is hairy . </s>``
+
+                ``token_type_ids:   0   0   0   0  0     0   0``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaModel.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    # No new behavior is defined here: the class only overrides the config class and the pretrained weights map.
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a `language modeling` head on top. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForMaskedLM(RobertaForMaskedLM):
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size - 1]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size - 1]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, masked_lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
+    """
+    # No new behavior is defined here: the class only overrides the config class and the pretrained weights map.
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
+    on top of the pooled output) e.g. for GLUE tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    # No new behavior is defined here: the class only overrides the config class and the pretrained weights map.
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above).
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForMultipleChoice.from_pretrained('xlm-roberta-large')
+        choices = ["Schloß Nymphenburg ist sehr schön .", "Der Schloßkanal auch !"]
+        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, classification_scores = outputs[:2]
+
+    """
+    # No new behavior is defined here: the class only overrides the config class and the pretrained weights map.
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForTokenClassification(RobertaForTokenClassification):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+    # No new behavior is defined here: the class only overrides the config class and the pretrained weights map.
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
--- a/transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -589,7 +589,7 @@ class XLNetModel(XLNetPreTrainedModel):

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetModel.from_pretrained('xlnet-large-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

@@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel):
        self.clamp_len = config.clamp_len
        self.n_layer = config.n_layer

-        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
+        self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
        self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
        self.dropout = nn.Dropout(config.dropout)
@@ -925,7 +925,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
        # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True)).unsqueeze(0)  # We will predict the masked token
        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
@@ -940,13 +940,37 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
        self.same_length = config.same_length

        self.transformer = XLNetModel(config)
-        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
+        self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_loss

+    def prepare_inputs_for_generation(self, input_ids, **model_kwargs):
+        """
+        Build the inputs needed to predict the next token of every sequence in the batch.
+
+        A dummy token is appended to each sequence; ``perm_mask`` hides that last position
+        and ``target_mapping`` selects it as the single prediction target.
+
+        Args:
+            input_ids: tensor indexed as ``(batch_size, sequence_length)``.
+            **model_kwargs: accepted for interface compatibility, not used here.
+
+        Returns:
+            dict with the keys ``input_ids``, ``perm_mask`` and ``target_mapping``.
+        """
+        batch_size = input_ids.shape[0]
+
+        # Add dummy token at the end (no attention on this one), one per sequence of the batch
+        dummy_token = torch.zeros((batch_size, 1), dtype=torch.long, device=input_ids.device)
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+        sequence_length = input_ids.shape[1]
+
+        # Build permutation mask so that previous tokens don't see last token
+        perm_mask = torch.zeros(
+            (batch_size, sequence_length, sequence_length),
+            dtype=torch.float, device=input_ids.device
+        )
+        perm_mask[:, :, -1] = 1.0
+
+        # We'll only predict the last token, for every sequence of the batch
+        target_mapping = torch.zeros(
+            (batch_size, 1, sequence_length),
+            dtype=torch.float, device=input_ids.device
+        )
+        target_mapping[:, 0, -1] = 1.0
+
+        return {"input_ids": input_ids,
+                "perm_mask": perm_mask,
+                "target_mapping": target_mapping
+               }
+
    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
        transformer_outputs = self.transformer(input_ids,
@@ -1007,7 +1031,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
@@ -1294,7 +1318,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
@@ -1409,7 +1433,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):

        tokenizer =  XLNetTokenizer.from_pretrained('xlnet-large-cased')
        model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        start_positions = torch.tensor([1])
        end_positions = torch.tensor([3])
        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)

--- a/transformers/pipelines.py
+++ b/transformers/pipelines.py
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import sys
+import csv
+import json
+import os
+import pickle
+import logging
+import six
+
+from abc import ABC, abstractmethod
+from contextlib import contextmanager
+from itertools import groupby
+from os.path import abspath, exists
+from typing import Union, Optional, Tuple, List, Dict
+
+import numpy as np
+
+from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
+                          PretrainedConfig, ModelCard, SquadExample,
+                          squad_convert_examples_to_features, is_tf_available,
+                          is_torch_available, BasicTokenizer,
+                          ALL_PRETRAINED_CONFIG_ARCHIVE_MAP)
+
+if is_tf_available():
+    import tensorflow as tf
+    from transformers import TFAutoModel, TFAutoModelForSequenceClassification, \
+        TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification
+
+if is_torch_available():
+    import torch
+    from transformers import AutoModel, AutoModelForSequenceClassification, \
+        AutoModelForQuestionAnswering, AutoModelForTokenClassification
+
+
+logger = logging.getLogger(__name__)
+
+def get_framework(model=None):
+    """ Pick the framework identifier ('tf' for TensorFlow, 'pt' for PyTorch) to run with.
+        When both are installed the class name of a supplied model instance decides;
+        without such an instance PyTorch wins. Raises ImportError when neither is installed.
+    """
+    has_tf = is_tf_available()
+    has_torch = is_torch_available()
+
+    if not has_tf and not has_torch:
+        raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. "
+                          "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
+                          "To install PyTorch, read the instructions at https://pytorch.org/.")
+
+    if has_tf and has_torch and model is not None and not isinstance(model, str):
+        # Both frameworks are usable and we got a model object:
+        # guess from its class name, TensorFlow model classes being prefixed with 'TF'
+        return 'tf' if model.__class__.__name__.startswith('TF') else 'pt'
+
+    # Only one framework installed, or nothing to guess from: prefer PyTorch
+    return 'pt' if has_torch else 'tf'
+
+class ArgumentHandler(ABC):
+    """
+    Abstract contract for the objects that turn the varargs given to a Pipeline into its inputs
+    """
+    @abstractmethod
+    def __call__(self, *args, **kwargs):
+        """Subclasses must override this; the base implementation only raises."""
+        raise NotImplementedError()
+
+
+class DefaultArgumentHandler(ArgumentHandler):
+    """
+    Default varargs argument parser handling parameters for each Pipeline.
+
+    Inputs are looked up, in order, in the ``X`` keyword, the ``data`` keyword and finally
+    in the positional arguments.
+    """
+    def __call__(self, *args, **kwargs):
+        """
+        Returns:
+            The samples to process. A single ``str`` is wrapped in a one-element list however it
+            was supplied, so that consumers looping over the result never iterate its characters.
+        Raises:
+            ValueError: when no input can be found in the arguments.
+        """
+        for key in ('X', 'data'):
+            if key in kwargs:
+                data = kwargs[key]
+                # A lone string is one sample, not a batch of characters
+                return [data] if isinstance(data, str) else data
+
+        if len(args) == 1:
+            return args[0] if isinstance(args[0], list) else [args[0]]
+        if len(args) > 1:
+            return list(args)
+        raise ValueError('Unable to infer the format of the provided data (X=, data=, ...)')
+
+
+class PipelineDataFormat:
+    """
+    Common ancestor of the data formats a pipeline can read from and write to.
+    The formats currently handled are:
+     - JSON
+     - CSV
+     - stdin/stdout (pipe)
+
+    It also carries the multi-columns helpers: a ``column`` string made of comma separated
+    `dataset_kwarg_1=dataset_column_1` items maps dataset columns to pipeline keyword arguments.
+    """
+    SUPPORTED_FORMATS = ['json', 'csv', 'pipe']
+
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+        self.output_path = output_path
+        self.input_path = input_path
+
+        columns = [''] if column is None else column.split(',')
+        self.is_multi_columns = len(columns) > 1
+        if self.is_multi_columns:
+            # 'kwarg=column' becomes (kwarg, column); a bare name maps onto itself
+            columns = [tuple(name.split('=')) if '=' in name else (name, name) for name in columns]
+        self.column = columns
+
+        # Refuse to write over an existing output unless explicitly allowed
+        if output_path is not None and not overwrite and exists(abspath(self.output_path)):
+            raise OSError('{} already exists on disk'.format(self.output_path))
+
+        # The input, when given, has to be there
+        if input_path is not None and not exists(abspath(self.input_path)):
+            raise OSError('{} doesnt exist on disk'.format(self.input_path))
+
+    @abstractmethod
+    def __iter__(self):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def save(self, data: dict):
+        """
+        Write the given data using the representation of the current `DataFormat`.
+        :param data: data to store
+        :return:
+        """
+        raise NotImplementedError()
+
+    def save_binary(self, data: Union[dict, List[dict]]) -> str:
+        """
+        Pickle the given data next to the output path, swapping its extension for `pickle`.
+        :param data: data to store
+        :return: (str) Path where the data has been saved
+        """
+        stem = os.path.splitext(self.output_path)[0]
+        binary_path = stem + os.path.extsep + 'pickle'
+
+        with open(binary_path, 'wb+') as binary_file:
+            pickle.dump(data, binary_file)
+
+        return binary_path
+
+    @staticmethod
+    def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+        """
+        Instantiate the data format matching the `format` name ('json', 'csv' or 'pipe').
+        Any other name raises a KeyError.
+        """
+        if format == 'json':
+            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+        if format == 'csv':
+            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+        if format == 'pipe':
+            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+        raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format))
+
+
+class CsvPipelineDataFormat(PipelineDataFormat):
+    """
+    CSV flavour of PipelineDataFormat: rows are read with `csv.DictReader` and written with
+    `csv.DictWriter`, the first line of the file being the header.
+    """
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+        super().__init__(output_path, input_path, column, overwrite=overwrite)
+
+    def __iter__(self):
+        """
+        Yield one item per row: a {kwarg: value} dict in multi-columns mode, otherwise the value
+        of the single requested column.
+        """
+        # newline='' leaves line-ending handling to the csv module (required for quoted newlines)
+        with open(self.input_path, 'r', newline='') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                if self.is_multi_columns:
+                    yield {k: row[c] for k, c in self.column}
+                else:
+                    yield row[self.column[0]]
+
+    def save(self, data: List[dict]):
+        """
+        Write the list of dicts to the output path, using the keys of the first item as header.
+        An empty list produces an empty file.
+        """
+        # newline='' prevents the extra blank lines the csv writer otherwise emits on some platforms
+        with open(self.output_path, 'w', newline='') as f:
+            if len(data) > 0:
+                writer = csv.DictWriter(f, list(data[0].keys()))
+                writer.writeheader()
+                writer.writerows(data)
+
+
+class JsonPipelineDataFormat(PipelineDataFormat):
+    """
+    JSON flavour of PipelineDataFormat: the input file is parsed once, at construction time.
+    """
+    def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False):
+        super().__init__(output_path, input_path, column, overwrite=overwrite)
+
+        # Load every entry upfront, __iter__ only walks through them
+        with open(input_path, 'r') as json_file:
+            self._entries = json.load(json_file)
+
+    def __iter__(self):
+        for record in self._entries:
+            if not self.is_multi_columns:
+                # Single column: hand back the raw value
+                yield record[self.column[0]]
+                continue
+
+            # Multi columns: map each pipeline kwarg to the value of its column
+            yield {kwarg: record[name] for kwarg, name in self.column}
+
+    def save(self, data: dict):
+        """Serialize the given data as JSON into the output path."""
+        with open(self.output_path, 'w') as json_file:
+            json.dump(data, json_file)
+
+
+class PipedPipelineDataFormat(PipelineDataFormat):
+    """
+    Read data from piped input to the python process.
+    For multi columns data, columns should be separated by a tab character.
+
+    If several columns are provided, then the output will be a dictionary with {column_x: value_x}
+    """
+    def __iter__(self):
+        """
+        Yield one item per line read on stdin:
+         - the line itself when it holds no tab,
+         - a {kwarg: value} dict when a multi-columns mapping was configured,
+         - a tuple of the tab separated values otherwise.
+        """
+        for line in sys.stdin:
+            # No dictionary to map arguments
+            if '\t' not in line:
+                yield line
+                continue
+
+            # Split for multi-columns
+            values = line.split('\t')
+            if self.is_multi_columns:
+                # self.column only holds (kwarg, column) pairs in multi-columns mode,
+                # it is a list of plain strings otherwise and cannot be unpacked
+                yield {kwarg: value for (kwarg, _), value in zip(self.column, values)}
+            else:
+                yield tuple(values)
+
+    def save(self, data: dict):
+        """Print the data on the standard output."""
+        print(data)
+
+    def save_binary(self, data: Union[dict, List[dict]]) -> str:
+        """
+        Pickle the data next to the output path.
+        Raises a KeyError when no output path was given, as binary data cannot go to stdout.
+        """
+        if self.output_path is None:
+            raise KeyError(
+                'When using piped input on pipeline outputting large object requires an output file path. '
+                'Please provide such output path through --output argument.'
+            )
+
+        return super().save_binary(data)
+
+
+class _ScikitCompat(ABC):
+    """
+    Abstract layer declaring the two entry points expected for Scikit and Keras compatibility.
+    """
+
+    @abstractmethod
+    def transform(self, X):
+        """To be provided by subclasses; the base implementation only raises."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def predict(self, X):
+        """To be provided by subclasses; the base implementation only raises."""
+        raise NotImplementedError()
+
+
+class Pipeline(_ScikitCompat):
+    """
+    Base class implementing pipelined operations.
+    Pipeline workflow is defined as a sequence of the following operations:
+        Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output
+
+    Pipeline supports running on CPU or GPU through the device argument. Users can specify
+    device argument as an integer, -1 meaning "CPU", >= 0 referring the CUDA device ordinal.
+
+    Some pipeline, like for instance FeatureExtractionPipeline ('feature-extraction') outputs large
+    tensor object as nested-lists. In order to avoid dumping such large structure as textual data we
+    provide the binary_output constructor argument. If set to True, the output will be stored in the
+    pickle format.
+
+    Arguments:
+        **model**: ``(str, PretrainedModel, TFPretrainedModel)``:
+            Reference to the model to use through this pipeline.
+
+        **tokenizer**: ``(str, PreTrainedTokenizer)``:
+            Reference to the tokenizer to use through this pipeline.
+
+        **modelcard**: ``ModelCard``:
+            Model card stored along with the pipeline (saved by ``save_pretrained`` when provided).
+
+        **framework**: ``str``:
+            'pt' or 'tf'. When omitted it is selected through ``get_framework(model)``.
+
+        **args_parser**: ``ArgumentHandler``:
+            Reference to the object in charge of parsing supplied pipeline parameters.
+
+        **device**: ``int``:
+            Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model
+            on the associated CUDA device id.
+
+        **binary_output** ``bool`` (default: False):
+            Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text.
+
+    Return:
+        Pipeline returns list or dictionary depending on:
+         - Does the user provided multiple sample
+         - The pipeline expose multiple fields in the output object
+
+    Examples:
+        nlp = pipeline('ner')
+        nlp = pipeline('ner', model='...', config='...', tokenizer='...')
+        nlp = NerPipeline(model='...', config='...', tokenizer='...')
+        nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...')
+    """
+
+    default_input_names = None
+
+    def __init__(self, model, tokenizer: PreTrainedTokenizer = None,
+                 modelcard: ModelCard = None, framework: Optional[str] = None,
+                 args_parser: ArgumentHandler = None, device: int = -1,
+                 binary_output: bool = False):
+
+        if framework is None:
+            # Hand the model over so that a model instance can drive the framework selection
+            # when both TensorFlow and PyTorch are installed
+            framework = get_framework(model)
+
+        self.model = model
+        self.tokenizer = tokenizer
+        self.modelcard = modelcard
+        self.framework = framework
+        self.device = device
+        self.binary_output = binary_output
+        self._args_parser = args_parser or DefaultArgumentHandler()
+
+        # Special handling: PyTorch models have to be moved explicitly on the CUDA device
+        if self.device >= 0 and self.framework == 'pt':
+            self.model = self.model.to('cuda:{}'.format(self.device))
+
+    def save_pretrained(self, save_directory):
+        """
+        Save the pipeline's model and, when they were provided, its tokenizer and model card
+        to the specified save_directory. Nothing is saved if save_directory is not a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Provided path ({}) should be a directory".format(save_directory))
+            return
+
+        self.model.save_pretrained(save_directory)
+
+        # tokenizer and modelcard both default to None in the constructor
+        if self.tokenizer is not None:
+            self.tokenizer.save_pretrained(save_directory)
+        if self.modelcard is not None:
+            self.modelcard.save_pretrained(save_directory)
+
+    def transform(self, X):
+        """
+        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
+        """
+        return self(X=X)
+
+    def predict(self, X):
+        """
+        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
+        """
+        return self(X=X)
+
+    @contextmanager
+    def device_placement(self):
+        """
+        Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.
+        example:
+            # Explicitly ask for tensor allocation on CUDA device :0
+            nlp = pipeline(..., device=0)
+            with nlp.device_placement():
+                # Every framework specific tensor allocation will be done on the request device
+                output = nlp(...)
+        Returns:
+            Context manager
+        """
+        if self.framework == 'tf':
+            with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)):
+                yield
+        else:
+            if self.device >= 0:
+                torch.cuda.set_device(self.device)
+
+            yield
+
+    def inputs_for_model(self, features: Union[dict, List[dict]]) -> Dict:
+        """
+        Generates the input dictionary with model-specific parameters.
+
+        Args:
+            features: either one dict of features, or a list of such dicts. In the latter case
+                every returned value is the list of the per-item values.
+
+        Returns:
+            dict holding all the required parameters for model's forward
+        """
+        args = ['input_ids', 'attention_mask']
+        model_type = type(self.model).__name__.lower()
+
+        # token_type_ids are not forwarded to the classes named after distilbert / xlm
+        if 'distilbert' not in model_type and 'xlm' not in model_type:
+            args += ['token_type_ids']
+
+        # PR #1548 (CLI) There is an issue with attention_mask
+        # if 'xlnet' in model_type or 'xlm' in model_type:
+        #     args += ['cls_index', 'p_mask']
+
+        if isinstance(features, dict):
+            return {k: features[k] for k in args}
+        else:
+            return {k: [feature[k] for feature in features] for k in args}
+
+    def __call__(self, *texts, **kwargs):
+        """
+        Parse the arguments, tokenize them as a batch and run the model on the result.
+
+        Returns:
+            Numpy array, as produced by _forward()
+        """
+        # Parse arguments
+        inputs = self._args_parser(*texts, **kwargs)
+
+        # Encode for forward
+        with self.device_placement():
+            inputs = self.tokenizer.batch_encode_plus(
+                inputs, add_special_tokens=True,
+                return_tensors=self.framework,
+                max_length=self.tokenizer.max_len
+            )
+
+            # Filter out features not available on specific models
+            inputs = self.inputs_for_model(inputs)
+            return self._forward(inputs)
+
+    def _forward(self, inputs):
+        """
+        Internal framework specific forward dispatching.
+        Args:
+            inputs: dict holding all the keyworded arguments for required by the model forward method.
+        Returns:
+            Numpy array
+        """
+        if self.framework == 'tf':
+            # TODO trace model
+            predictions = self.model(inputs, training=False)[0]
+        else:
+            with torch.no_grad():
+                predictions = self.model(**inputs)[0].cpu()
+
+        return predictions.numpy()
+
+
+class FeatureExtractionPipeline(Pipeline):
+    """
+    Pipeline returning the first output of the model for the given inputs, as nested lists.
+    """
+
+    def __init__(self, model,
+                 tokenizer: PreTrainedTokenizer = None,
+                 modelcard: ModelCard = None,
+                 framework: Optional[str] = None,
+                 args_parser: ArgumentHandler = None,
+                 device: int = -1):
+        # binary_output is not exposed to the caller: it is always switched on here
+        super().__init__(
+            model=model, tokenizer=tokenizer, modelcard=modelcard,
+            framework=framework, args_parser=args_parser,
+            device=device, binary_output=True
+        )
+
+    def __call__(self, *args, **kwargs):
+        features = super().__call__(*args, **kwargs)
+        # Turn the array coming from the parent class into plain nested lists
+        return features.tolist()
+
+
+class TextClassificationPipeline(Pipeline):
+    """
+    Text classification pipeline using ModelForTextClassification head.
+    """
+
+    def __call__(self, *args, **kwargs):
+        """
+        Returns:
+            One {'label': str, 'score': float} dict per row of the model output, holding the
+            most probable label (looked up in the model config) and its probability.
+        """
+        outputs = super().__call__(*args, **kwargs)
+
+        # Softmax over the last axis. keepdims=True keeps the reduced axis so that every row is
+        # divided by its own sum, whatever the number of rows
+        exp_outputs = np.exp(outputs)
+        scores = exp_outputs / exp_outputs.sum(-1, keepdims=True)
+
+        # .item() converts the numpy scalar to a Python float, as the other pipelines do for their scores
+        return [{'label': self.model.config.id2label[item.argmax()], 'score': item.max().item()} for item in scores]
+
+
+class NerPipeline(Pipeline):
+    """
+    Named Entity Recognition pipeline using ModelForTokenClassification head.
+    """
+
+    default_input_names = 'sequences'
+
+    def __init__(self, model, tokenizer: PreTrainedTokenizer = None,
+                 modelcard: ModelCard = None, framework: Optional[str] = None,
+                 args_parser: ArgumentHandler = None, device: int = -1,
+                 binary_output: bool = False, ignore_labels=None):
+        """
+        Args:
+            ignore_labels: labels whose tokens are left out of the output. Defaults to ['O'].
+            (other arguments are forwarded to Pipeline)
+        """
+        super().__init__(model=model,
+                         tokenizer=tokenizer,
+                         modelcard=modelcard,
+                         framework=framework,
+                         args_parser=args_parser,
+                         device=device,
+                         binary_output=binary_output)
+
+        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
+        # A fresh list per instance: a list used as default value would be shared by every pipeline
+        self.ignore_labels = ['O'] if ignore_labels is None else ignore_labels
+
+    def __call__(self, *texts, **kwargs):
+        """
+        Returns:
+            For each input sentence, the list of {'word': str, 'score': float, 'entity': str}
+            dicts of the tokens whose predicted label is not in ignore_labels.
+            When a single sentence is given, its list is returned directly.
+        """
+        inputs, answers = self._args_parser(*texts, **kwargs), []
+        for sentence in inputs:
+
+            # Manage correct placement of the tensors
+            with self.device_placement():
+
+                tokens = self.tokenizer.encode_plus(
+                    sentence, return_attention_mask=False,
+                    return_tensors=self.framework,
+                    max_length=self.tokenizer.max_len
+                )
+
+                # Forward
+                if self.framework == 'tf':
+                    entities = self.model(tokens)[0][0].numpy()
+                    input_ids = tokens['input_ids'].numpy()[0]
+                else:
+                    with torch.no_grad():
+                        entities = self.model(**tokens)[0][0].cpu().numpy()
+                        input_ids = tokens['input_ids'].cpu().numpy()[0]
+
+            # Softmax over the labels, then most probable label of each token
+            score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
+            labels_idx = score.argmax(axis=-1)
+
+            answer = []
+            for idx, label_idx in enumerate(labels_idx):
+                if self.model.config.id2label[label_idx] not in self.ignore_labels:
+                    answer += [{
+                        'word': self.tokenizer.decode([int(input_ids[idx])]),
+                        'score': score[idx][label_idx].item(),
+                        'entity': self.model.config.id2label[label_idx]
+                    }]
+
+            # Append
+            answers += [answer]
+        if len(answers) == 1:
+            return answers[0]
+        return answers
+
+
+class QuestionAnsweringArgumentHandler(ArgumentHandler):
+    """
+    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped
+    to internal SquadExample / SquadFeature structures.
+
+    QuestionAnsweringArgumentHandler manages all the possible ways to create SquadExample from the command-line
+    supplied arguments.
+    """
+    def __call__(self, *args, **kwargs):
+        """
+        Supported inputs:
+         - positional arguments, handled as if given through X
+         - X= / data=: a SquadExample, a {question:..., context:...} dict, or an iterable of those
+         - question= / context=: a string or a list of strings each
+
+        Returns:
+            list of SquadExample
+        Raises:
+            KeyError: a dict item lacks the 'question' or 'context' key
+            ValueError: an item is neither a dict nor a SquadExample, or no known argument was given
+        """
+        # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating
+        if args:
+            kwargs['X'] = args[0] if len(args) == 1 else list(args)
+
+        # Generic compatibility with sklearn and Keras
+        # Batched data
+        if 'X' in kwargs or 'data' in kwargs:
+            key = 'X' if 'X' in kwargs else 'data'
+            inputs = kwargs[key]
+
+            if isinstance(inputs, (dict, SquadExample)):
+                # A lone sample (dict or SquadExample) becomes a batch of one
+                inputs = [inputs]
+            else:
+                # Copy to avoid overriding arguments
+                inputs = list(inputs)
+
+            for i, item in enumerate(inputs):
+                if isinstance(item, dict):
+                    if any(k not in item for k in ['question', 'context']):
+                        raise KeyError('You need to provide a dictionary with keys {question:..., context:...}')
+
+                    inputs[i] = QuestionAnsweringPipeline.create_sample(**item)
+
+                elif not isinstance(item, SquadExample):
+                    raise ValueError(
+                        '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)'
+                            .format(key)
+                    )
+
+            return inputs
+
+        # Tabular input
+        if 'question' in kwargs and 'context' in kwargs:
+            questions, contexts = kwargs['question'], kwargs['context']
+
+            if isinstance(questions, str):
+                questions = [questions]
+
+            if isinstance(contexts, str):
+                contexts = [contexts]
+
+            return [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(questions, contexts)]
+
+        raise ValueError('Unknown arguments {}'.format(kwargs))
+
+
+class QuestionAnsweringPipeline(Pipeline):
+    """
+    Question Answering pipeline using ModelForQuestionAnswering head.
+    """
+
+    default_input_names = 'question,context'
+
+    def __init__(self, model,
+                 tokenizer: Optional[PreTrainedTokenizer],
+                 modelcard: Optional[ModelCard],
+                 framework: Optional[str] = None,
+                 device: int = -1, **kwargs):
+        """
+        Set up the parent Pipeline with a QuestionAnsweringArgumentHandler as arguments parser.
+        Extra keyword arguments are handed over to the parent constructor untouched.
+        """
+        super().__init__(
+            model=model, tokenizer=tokenizer, modelcard=modelcard,
+            framework=framework, args_parser=QuestionAnsweringArgumentHandler(),
+            device=device, **kwargs
+        )
+
+    @staticmethod
+    def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]:
+        """
+        Wrap question(s) and context(s) into the SquadExample structure(s) the pipeline works with.
+        Only the question and the context are filled in, every other SquadExample argument is None.
+        Arguments:
+             question: (str, List[str]) The question to be ask for the associated context
+             context: (str, List[str]) The context in which we will look for the answer.
+
+        Returns:
+            A single SquadExample, or a list of SquadExample (questions and contexts paired in order)
+            when question is a list.
+        """
+        if not isinstance(question, list):
+            return SquadExample(None, question, context, None, None, None)
+
+        return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
+
+    def __call__(self, *texts, **kwargs):
+        """
+        Extract the answer span(s) of the given question(s) from the associated context(s).
+
+        Args:
+            We support multiple use-cases, the following are exclusive:
+            X: sequence of SquadExample
+            data: sequence of SquadExample
+            question: (str, List[str]), batch of question(s) to map along with context
+            context: (str, List[str]), batch of context(s) associated with the provided question keyword argument
+
+            The following keyword arguments tune the extraction:
+            topk: (int, default 1) forwarded to decode(), must be >= 1
+            doc_stride: (int, default 128) forwarded to squad_convert_examples_to_features
+            max_answer_len: (int, default 15) forwarded to decode(), must be >= 1
+            max_seq_len: (int, default 384) forwarded to squad_convert_examples_to_features
+            max_question_len: (int, default 64) forwarded to squad_convert_examples_to_features
+        Returns:
+            dict: {'answer': str, 'score': float, 'start': int, 'end': int}, or a list of such dicts
+            when more than one answer is produced
+            answer: the textual answer in the initial context
+            score: the score the current answer scored for the model
+            start: the character index in the original string corresponding to the beginning of the answer's span
+            end: the character index in the original string corresponding to the ending of the answer's span
+        Raises:
+            ValueError: if topk or max_answer_len is lower than 1
+        """
+        # Set defaults values
+        kwargs.setdefault('topk', 1)
+        kwargs.setdefault('doc_stride', 128)
+        kwargs.setdefault('max_answer_len', 15)
+        kwargs.setdefault('max_seq_len', 384)
+        kwargs.setdefault('max_question_len', 64)
+
+        if kwargs['topk'] < 1:
+            raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk']))
+
+        if kwargs['max_answer_len'] < 1:
+            raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len']))
+
+        # Convert inputs to features
+        examples = self._args_parser(*texts, **kwargs)
+        features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False)
+        fw_args = self.inputs_for_model([f.__dict__ for f in features])
+
+        # Manage tensor allocation on correct device
+        with self.device_placement():
+            if self.framework == 'tf':
+                fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()}
+                # NOTE(review): unpacking assumes the model returns exactly two outputs - confirm
+                start, end = self.model(fw_args)
+                start, end = start.numpy(), end.numpy()
+            else:
+                with torch.no_grad():
+                    # Retrieve the score for the context tokens only (removing question tokens)
+                    fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()}
+                    start, end = self.model(**fw_args)
+                    start, end = start.cpu().numpy(), end.cpu().numpy()
+
+        answers = []
+        # NOTE(review): zip pairs the i-th example with the i-th feature; presumably this drifts if the
+        # conversion emits several features for a single example - verify against squad_convert_examples_to_features
+        for (example, feature, start_, end_) in zip(examples, features, start, end):
+            # Normalize logits and spans to retrieve the answer
+            start_ = np.exp(start_) / np.sum(np.exp(start_))
+            end_ = np.exp(end_) / np.sum(np.exp(end_))
+
+            # Mask padding and question
+            # |p_mask - 1| turns the entries equal to 1 into 0, zeroing the matching probabilities
+            start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
+
+            # TODO : What happens if not possible
+            # Mask CLS
+            start_[0] = end_[0] = 0
+
+            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])
+            char_to_word = np.array(example.char_to_word_offset)
+
+            # Convert the answer (tokens) back to the original text
+            # start is the first character mapped to the first word of the span, end the last
+            # character mapped to its last word
+            answers += [
+                {
+                    'score': score.item(),
+                    'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
+                    'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
+                    'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]:feature.token_to_orig_map[e] + 1])
+                }
+                for s, e, score in zip(starts, ends, scores)
+            ]
+        # A single answer is returned as is rather than wrapped in a list
+        if len(answers) == 1:
+            return answers[0]
+        return answers
+
+    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
+        """
+        Take the output of any QuestionAnswering head and generate probabilities for each span to be
+        the actual answer.
+        In addition, it filters out some unwanted/impossible cases like answer len being greater than
+        max_answer_len or answer end position being before the starting position.
+        The method supports returning the k-best answers through the topk argument.
+
+        Args:
+            start: numpy array, holding individual start probabilities for each token
+            end: numpy array, holding individual end probabilities for each token
+            topk: int, indicates how many possible answer span(s) to extract from the model's output
+            max_answer_len: int, maximum size (in tokens) of the answer to extract from the model's output
+
+        Returns:
+            tuple (starts, ends, scores): start token indices, end token indices and span scores,
+            ordered by decreasing score. At most `topk` spans are returned; fewer when the input
+            holds fewer (start, end) combinations than `topk`. Filtered-out spans have a score of 0
+            and can show up at the tail when `topk` exceeds the number of valid spans.
+        """
+        # Ensure we have batch axis
+        if start.ndim == 1:
+            start = start[None]
+
+        if end.ndim == 1:
+            end = end[None]
+
+        # Compute the score of each tuple(start, end) to be the real answer
+        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
+
+        # Zero out candidates with end < start (triu) and those spanning more than
+        # max_answer_len tokens, i.e. end - start > max_answer_len - 1 (tril)
+        candidates = np.tril(np.triu(outer), max_answer_len - 1)
+
+        #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
+        scores_flat = candidates.flatten()
+        if topk == 1:
+            idx_sort = [np.argmax(scores_flat)]
+        elif len(scores_flat) <= topk:
+            # Not more candidates than requested: a full sort is enough. `<=` matters here,
+            # np.argpartition(..., kth=topk) is out of bounds when topk == len(scores_flat).
+            idx_sort = np.argsort(-scores_flat)
+        else:
+            # Partial selection of the topk best scores, then order only those
+            idx = np.argpartition(-scores_flat, topk)[0:topk]
+            idx_sort = idx[np.argsort(-scores_flat[idx])]
+
+        # Drop the batch coordinate; scores are read from the first batch entry only
+        start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
+        return start, end, candidates[0, start, end]
+
+    def span_to_answer(self, text: str, start: int, end: int):
+        """
+        Map a span expressed in token indexes back to the words and character offsets of
+        the initial context.
+
+        The context is split on single spaces and every word is tokenized with
+        `self.tokenizer`. A word belongs to the answer when the index of its first
+        sub-token lies within [start, end]; character offsets are only recorded for words
+        whose first sub-token index equals `start` (resp. `end`).
+
+        Args:
+            text: str, the actual context to extract the answer from
+            start: int, starting answer token index
+            end: int, ending answer token index
+
+        Returns:
+            dict: {'answer': str, 'start': int, 'end': int}
+        """
+        selected_words = []
+        # Index of the first sub-token of the current word / offset of its first character
+        tok_pos = 0
+        char_pos = 0
+        span_char_start = 0
+        span_char_end = 0
+
+        for word in text.split(" "):
+            n_sub_tokens = len(self.tokenizer.tokenize(word))
+
+            # Stop if we went over the end of the answer
+            if tok_pos > end:
+                break
+
+            # Here tok_pos <= end, so the word is in the span as soon as it reaches start
+            if tok_pos >= start:
+                if tok_pos == start:
+                    span_char_start = char_pos
+
+                if tok_pos == end:
+                    span_char_end = char_pos + len(word)
+
+                selected_words.append(word)
+
+            # Move to the next word: skip its sub-tokens, its characters and the separating space
+            tok_pos += n_sub_tokens
+            char_pos += len(word) + 1
+
+        # Join text with spaces and clamp the offsets to the bounds of the context
+        answer = ' '.join(selected_words)
+        return {'answer': answer, 'start': max(0, span_char_start), 'end': min(len(text), span_char_end)}
+
+
+# Register all the supported tasks here.
+# Each task name maps to:
+#   'impl':    the pipeline class instantiated by `pipeline()` for this task
+#   'tf'/'pt': the model class used to load the model for that framework
+#              (None when the framework is not available)
+#   'default': the model (one entry per framework), config and tokenizer that
+#              `pipeline()` falls back to when it is called without a model
+SUPPORTED_TASKS = {
+    'feature-extraction': {
+        'impl': FeatureExtractionPipeline,
+        'tf': TFAutoModel if is_tf_available() else None,
+        'pt': AutoModel if is_torch_available() else None,
+        'default': {
+            'model': {
+                'pt': 'distilbert-base-uncased',
+                'tf': 'distilbert-base-uncased',
+            },
+            'config': None,
+            'tokenizer': 'distilbert-base-uncased'
+        }
+    },
+    'sentiment-analysis': {
+        'impl': TextClassificationPipeline,
+        'tf': TFAutoModelForSequenceClassification if is_tf_available() else None,
+        'pt': AutoModelForSequenceClassification if is_torch_available() else None,
+        'default': {
+            # Weights and config are referenced by URL rather than by shortcut name
+            'model': {
+                'pt': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin',
+                'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5',
+            },
+            'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json',
+            'tokenizer': 'distilbert-base-uncased'
+        }
+    },
+    'ner': {
+        'impl': NerPipeline,
+        'tf': TFAutoModelForTokenClassification if is_tf_available() else None,
+        'pt': AutoModelForTokenClassification if is_torch_available() else None,
+        'default': {
+            # Weights and config are referenced by URL rather than by shortcut name
+            'model': {
+                'pt':'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin',
+                'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5',
+            },
+            'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json',
+            'tokenizer': 'bert-large-cased'
+        }
+    },
+    'question-answering': {
+        'impl': QuestionAnsweringPipeline,
+        'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None,
+        'pt': AutoModelForQuestionAnswering if is_torch_available() else None,
+        'default': {
+            'model': {
+                'pt': 'distilbert-base-uncased-distilled-squad',
+                'tf': 'distilbert-base-uncased-distilled-squad',
+            },
+            'config': None,
+            'tokenizer': 'distilbert-base-uncased'
+        }
+    }
+}
+
+
+def pipeline(task: str, model: Optional = None,
+             config: Optional[Union[str, PretrainedConfig]] = None,
+             tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+             modelcard: Optional[Union[str, ModelCard]] = None,
+             **kwargs) -> Pipeline:
+    """
+    Utility factory method to build a pipeline.
+    Pipeline are made of:
+        A Tokenizer instance in charge of mapping raw textual input to token
+        A Model instance
+        Some (optional) post processing for enhancing model's output
+
+    Args:
+        task: str, name of the task, must be one of the keys of SUPPORTED_TASKS
+        model: model instance, or str (shortcut name / path / url) loaded with the task's model class.
+            When None, the task's default model, config and tokenizer are used (any `config` or
+            `tokenizer` argument is then replaced by the task defaults).
+        config: configuration instance, or str loaded through AutoConfig
+        tokenizer: tokenizer instance, or str loaded through AutoTokenizer. When None, it is
+            inferred from `model` or `config` if one of them is a known pretrained shortcut name.
+        modelcard: model card instance, or str loaded through ModelCard. When None, falls back on
+            `model` or `config` if one of them is a str.
+        **kwargs: forwarded to the pipeline class constructor
+
+    Returns:
+        The pipeline instance implementing the requested task.
+
+    Raises:
+        KeyError: if `task` is not a supported task
+        Exception: if no tokenizer is provided and none can be inferred from `model` or `config`
+
+    Examples:
+        pipeline('sentiment-analysis')
+        pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-cased')
+        pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...))
+        pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased')
+    """
+    # Retrieve the task
+    if task not in SUPPORTED_TASKS:
+        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))
+
+    framework = get_framework(model)
+
+    targeted_task = SUPPORTED_TASKS[task]
+    pipeline_class, model_class = targeted_task['impl'], targeted_task[framework]
+
+    # Use default model/config/tokenizer for the task if no model is provided
+    if model is None:
+        # Look the entries up by key: unpacking `.values()` would rely on the dict ordering,
+        # which is not guaranteed by the language before Python 3.7
+        defaults = targeted_task['default']
+        model = defaults['model'][framework]
+        config = defaults['config']
+        tokenizer = defaults['tokenizer']
+
+    # Try to infer tokenizer from model or config name (if provided as str)
+    if tokenizer is None:
+        if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            tokenizer = model
+        elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            tokenizer = config
+        else:
+            # Impossible to guess what is the right tokenizer here
+            raise Exception("Impossible to guess which tokenizer to use. "
+                            "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer.")
+
+    # Try to infer modelcard from model or config name (if provided as str)
+    if modelcard is None:
+        # Try to fallback on one of the provided string for model or config (will replace the suffix)
+        if isinstance(model, str):
+            modelcard = model
+        elif isinstance(config, str):
+            modelcard = config
+
+    # Instantiate tokenizer if needed
+    if isinstance(tokenizer, six.string_types):
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer)
+
+    # Instantiate config if needed
+    if isinstance(config, str):
+        config = AutoConfig.from_pretrained(config)
+
+    # Instantiate modelcard if needed
+    if isinstance(modelcard, str):
+        modelcard = ModelCard.from_pretrained(modelcard)
+
+    # Instantiate model if needed
+    if isinstance(model, str):
+        # Handle transparent TF/PT model conversion: weights saved by the other framework
+        # are detected from the file extension and converted at load time
+        model_kwargs = {}
+        if framework == 'pt' and model.endswith('.h5'):
+            model_kwargs['from_tf'] = True
+            logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. '
+                           'Trying to load the model with PyTorch.')
+        elif framework == 'tf' and model.endswith('.bin'):
+            model_kwargs['from_pt'] = True
+            logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. '
+                           'Trying to load the model with Tensorflow.')
+        model = model_class.from_pretrained(model, config=config, **model_kwargs)
+
+    return pipeline_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs)
--- a/transformers/tests/configuration_common_test.py
+++ b/transformers/tests/configuration_common_test.py
@@ -16,15 +16,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import copy
 import os
-import shutil
 import json
-import random
-import uuid
+import tempfile

 import unittest
-import logging
+from .tokenization_tests_commons import TemporaryDirectory


 class ConfigTester(object):
@@ -48,16 +45,28 @@ class ConfigTester(object):

    def create_and_test_config_to_json_file(self):
        config_first = self.config_class(**self.inputs_dict)
-        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
-        config_first.to_json_file(json_file_path)
-        config_second = self.config_class.from_json_file(json_file_path)
-        os.remove(json_file_path)
+
+        with TemporaryDirectory() as tmpdirname:
+            json_file_path = os.path.join(tmpdirname, "config.json")
+            config_first.to_json_file(json_file_path)
+            config_second = self.config_class.from_json_file(json_file_path)
+
+        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
+
+    def create_and_test_config_from_and_save_pretrained(self):
+        config_first = self.config_class(**self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            config_first.save_pretrained(tmpdirname)
+            config_second = self.config_class.from_pretrained(tmpdirname)
+
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())

    def run_common_tests(self):
        self.create_and_test_config_common_properties()
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()
+        self.create_and_test_config_from_and_save_pretrained()

 if __name__ == "__main__":
    unittest.main()
\ No newline at end of file
--- a/transformers/tests/fixtures/empty.txt
+++ b/transformers/tests/fixtures/empty.txt
--- a/transformers/tests/hf_api_test.py
+++ b/transformers/tests/hf_api_test.py
@@ -15,18 +15,30 @@
 from __future__ import absolute_import, division, print_function

 import os
-import six
 import time
 import unittest

-from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
+import requests
+import six
+
+from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj

 USER = "__DUMMY_TRANSFORMERS_USER__"
 PASS = "__DUMMY_TRANSFORMERS_PASS__"
-FILE_KEY = "Test-{}.txt".format(int(time.time()))
-FILE_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
-)
+FILES = [
+    (
+        "Test-{}.txt".format(int(time.time())),
+        os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
+        )
+    ),
+    (
+        "yoyo {}.txt".format(int(time.time())), # space is intentional
+        os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"
+        )
+    ),
+]



@@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest):
        self.assertEqual(user, USER)

    def test_presign(self):
-        urls = self._api.presign(token=self._token, filename=FILE_KEY)
-        self.assertIsInstance(urls, PresignedUrl)
-        self.assertEqual(urls.type, "text/plain")
+        for FILE_KEY, FILE_PATH in FILES:
+            urls = self._api.presign(token=self._token, filename=FILE_KEY)
+            self.assertIsInstance(urls, PresignedUrl)
+            self.assertEqual(urls.type, "text/plain")

    def test_presign_and_upload(self):
-        access_url = self._api.presign_and_upload(
-            token=self._token, filename=FILE_KEY, filepath=FILE_PATH
-        )
-        self.assertIsInstance(access_url, six.string_types)
+        for FILE_KEY, FILE_PATH in FILES:
+            access_url = self._api.presign_and_upload(
+                token=self._token, filename=FILE_KEY, filepath=FILE_PATH
+            )
+            self.assertIsInstance(access_url, six.string_types)
+            with open(FILE_PATH, 'r') as f:
+                body = f.read()
+            r = requests.get(access_url)
+            self.assertEqual(r.text, body)

    def test_list_objs(self):
        objs = self._api.list_objs(token=self._token)

--- a/transformers/tests/model_card_test.py
+++ b/transformers/tests/model_card_test.py
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import json
+import unittest
+
+from transformers.modelcard import ModelCard
+from .tokenization_tests_commons import TemporaryDirectory
+
+class ModelCardTester(unittest.TestCase):
+    """Checks that a ModelCard built from a dict exposes its sections and survives serialization."""
+
+    def setUp(self):
+        # Sections of the card used as reference input by every test
+        model_details = {
+            'Organization': 'testing',
+            'Model date': 'today',
+            'Model version': 'v2.1, Developed by Test Corp in 2019.',
+            'Architecture': 'Convolutional Neural Network.',
+        }
+        evaluation_data = {
+            'Datasets': {
+                'BLEU': 'My-great-dataset-v1',
+                'ROUGE-1': 'My-short-dataset-v2.1',
+            },
+            'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf',
+        }
+        training_data = {
+            'Dataset': 'English Wikipedia dump dated 2018-12-01',
+            'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf',
+        }
+        quantitative_analyses = {
+            'BLEU': 55.1,
+            'ROUGE-1': 76,
+        }
+        self.inputs_dict = {
+            'model_details': model_details,
+            'metrics': 'BLEU and ROUGE-1',
+            'evaluation_data': evaluation_data,
+            'training_data': training_data,
+            'quantitative_analyses': quantitative_analyses,
+        }
+
+    def test_model_card_common_properties(self):
+        # Every section must exist as an attribute, including those absent from the input dict
+        card = ModelCard.from_dict(self.inputs_dict)
+        expected_sections = (
+            'model_details',
+            'intended_use',
+            'factors',
+            'metrics',
+            'evaluation_data',
+            'training_data',
+            'quantitative_analyses',
+            'ethical_considerations',
+            'caveats_and_recommendations',
+        )
+        for section in expected_sections:
+            self.assertTrue(hasattr(card, section))
+
+    def test_model_card_to_json_string(self):
+        # The JSON string must give back every input section unchanged
+        card = ModelCard.from_dict(self.inputs_dict)
+        decoded = json.loads(card.to_json_string())
+        for key in self.inputs_dict:
+            self.assertEqual(decoded[key], self.inputs_dict[key])
+
+    def test_model_card_to_json_file(self):
+        # Round-trip through a JSON file written in a throw-away directory
+        original_card = ModelCard.from_dict(self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            json_path = os.path.join(tmpdirname, u"modelcard.json")
+            original_card.to_json_file(json_path)
+            reloaded_card = ModelCard.from_json_file(json_path)
+
+        self.assertEqual(reloaded_card.to_dict(), original_card.to_dict())
+
+    def test_model_card_from_and_save_pretrained(self):
+        # Round-trip through save_pretrained / from_pretrained on a throw-away directory
+        original_card = ModelCard.from_dict(self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            original_card.save_pretrained(tmpdirname)
+            reloaded_card = ModelCard.from_pretrained(tmpdirname)
+
+        self.assertEqual(reloaded_card.to_dict(), original_card.to_dict())
+
+if __name__ == "__main__":
+    unittest.main()
--- a/transformers/tests/modeling_albert_test.py
+++ b/transformers/tests/modeling_albert_test.py
@@ -17,13 +17,12 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil

 from transformers import is_torch_available

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device

 if is_torch_available():
    from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
@@ -110,7 +109,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -230,10 +229,8 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):

    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = AlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)

 if __name__ == "__main__":

--- a/transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -22,7 +22,7 @@ import logging

 from transformers import is_torch_available

-from .utils import require_torch, slow
+from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER

 if is_torch_available():
    from transformers import (AutoConfig, BertConfig,
@@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForQuestionAnswering)

+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, BertForMaskedLM)
+

 if __name__ == "__main__":
    unittest.main()
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -17,13 +17,12 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil

 from transformers import is_torch_available

 from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device

 if is_torch_available():
    from transformers import (BertConfig, BertModel, BertForMaskedLM,
@@ -109,7 +108,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -360,10 +359,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):

    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = BertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)



--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -18,7 +18,7 @@ from __future__ import print_function

 import copy
 import sys
-import os
+import os.path
 import shutil
 import tempfile
 import json
@@ -30,7 +30,7 @@ import logging

 from transformers import is_torch_available

-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device

 if is_torch_available():
    import torch
@@ -58,7 +58,7 @@ else:
 def _config_zero_init(config):
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
-        if '_range' in key or '_std' in key:
+        if '_range' in key or '_std' in key or 'initializer_factor' in key:
            setattr(configs_no_init, key, 0.0)
    return configs_no_init

@@ -73,6 +73,7 @@ class CommonTestCases:
        test_pruning = True
        test_resize_embeddings = True
        test_head_masking = True
+        is_encoder_decoder = False

        def test_save_load(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -83,6 +84,8 @@ class CommonTestCases:
                model.eval()
                with torch.no_grad():
                    outputs = model(**inputs_dict)
+                out_2 = outputs[0].numpy()
+                out_2[np.isnan(out_2)] = 0

                with TemporaryDirectory() as tmpdirname:
                    model.save_pretrained(tmpdirname)
@@ -93,9 +96,7 @@ class CommonTestCases:

                    # Make sure we don't have nans
                    out_1 = after_outputs[0].cpu().numpy()
-                    out_2 = outputs[0].cpu().numpy()
-                    out_1 = out_1[~np.isnan(out_1)]
-                    out_2 = out_2[~np.isnan(out_2)]
+                    out_1[np.isnan(out_1)] = 0
                    max_diff = np.amax(np.abs(out_1 - out_2))
                    self.assertLessEqual(max_diff, 1e-5)

@@ -117,20 +118,32 @@ class CommonTestCases:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
-                self.assertEqual(first.ne(second).sum().item(), 0)
-
+                with torch.no_grad():
+                    first = model(**inputs_dict)[0]
+                    second = model(**inputs_dict)[0]
+                out_1 = first.cpu().numpy()
+                out_2 = second.cpu().numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)

        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+
            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
@@ -138,28 +151,42 @@ class CommonTestCases:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length ,
+                    encoder_key_length])
                out_len = len(outputs)

+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2)-1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length
+                         ])
+
                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                outputs = model(**inputs_dict)
-                self.assertEqual(out_len+1, len(outputs))
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)

-                attentions = outputs[-1]
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self_attentions = outputs[-1]
+                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
+                    list(self_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])

        def test_torchscript(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -191,21 +218,22 @@ class CommonTestCases:
                inputs = inputs_dict['input_ids']  # Let's keep only input_ids

                try:
-                    torch.jit.trace(model, inputs)
+                    traced_gpt2 = torch.jit.trace(model, inputs)
                except RuntimeError:
                    self.fail("Couldn't trace module.")

-                try:
-                    traced_gpt2 = torch.jit.trace(model, inputs)
-                    torch.jit.save(traced_gpt2, "traced_model.pt")
-                except RuntimeError:
-                    self.fail("Couldn't save module.")
+                with TemporaryDirectory() as tmp_dir_name:
+                    pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")

-                try:
-                    loaded_model = torch.jit.load("traced_model.pt")
-                    os.remove("traced_model.pt")
-                except ValueError:
-                    self.fail("Couldn't load module.")
+                    try:
+                        torch.jit.save(traced_gpt2, pt_file_name)
+                    except Exception:
+                        self.fail("Couldn't save module.")
+
+                    try:
+                        loaded_model = torch.jit.load(pt_file_name)
+                    except Exception:
+                        self.fail("Couldn't load module.")

                model.to(torch_device)
                model.eval()
@@ -223,7 +251,6 @@ class CommonTestCases:

                self.assertTrue(models_equal)

-
        def test_headmasking(self):
            if not self.test_head_masking:
                return
@@ -278,7 +305,6 @@ class CommonTestCases:
                self.assertNotEqual(
                    attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)

-
        def test_head_pruning(self):
            if not self.test_pruning:
                return
@@ -297,7 +323,8 @@ class CommonTestCases:
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                -1: [0]}
                model.prune_heads(heads_to_prune)
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)

                attentions = outputs[-1]

@@ -326,20 +353,19 @@ class CommonTestCases:
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                -1: [0]}
                model.prune_heads(heads_to_prune)
-                directory = "pruned_model"
-                if not os.path.exists(directory):
-                    os.makedirs(directory)
-                model.save_pretrained(directory)
-                model = model_class.from_pretrained(directory)
-                model.to(torch_device)

-                outputs = model(**inputs_dict)
+                with TemporaryDirectory() as temp_dir_name:
+                    model.save_pretrained(temp_dir_name)
+                    model = model_class.from_pretrained(temp_dir_name)
+                    model.to(torch_device)
+
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)

-                shutil.rmtree(directory)

        def test_head_pruning_save_load_from_config_init(self):
            if not self.test_pruning:
@@ -362,7 +388,8 @@ class CommonTestCases:
                model.to(torch_device)
                model.eval()

-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], 1)
@@ -389,7 +416,8 @@ class CommonTestCases:
                model.to(torch_device)
                model.eval()

-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -397,16 +425,13 @@ class CommonTestCases:
                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)

-                directory = "pruned_model"
-
-                if not os.path.exists(directory):
-                    os.makedirs(directory)
-                model.save_pretrained(directory)
-                model = model_class.from_pretrained(directory)
-                model.to(torch_device)
-                shutil.rmtree(directory)
+                with TemporaryDirectory() as temp_dir_name:
+                    model.save_pretrained(temp_dir_name)
+                    model = model_class.from_pretrained(temp_dir_name)
+                    model.to(torch_device)

-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -417,7 +442,8 @@ class CommonTestCases:
                heads_to_prune = {0: [0], 2: [1, 2]}
                model.prune_heads(heads_to_prune)

-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]

                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
@@ -427,7 +453,6 @@ class CommonTestCases:

                self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})

-
        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

@@ -437,14 +462,16 @@ class CommonTestCases:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                hidden_states = outputs[-1]
                self.assertEqual(model.config.output_attentions, False)
                self.assertEqual(model.config.output_hidden_states, True)
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size])
+                    [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
+                     self.model_tester.hidden_size])

        def test_resize_tokens_embeddings(self):
            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -550,8 +577,14 @@ class CommonTestCases:

        def test_inputs_embeds(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]

            for model_class in self.all_model_classes:
                model = model_class(config)
@@ -559,9 +592,14 @@ class CommonTestCases:
                model.eval()

                wte = model.get_input_embeddings()
-                inputs_dict["inputs_embeds"] = wte(input_ids)
-                outputs = model(**inputs_dict)
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = wte(input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)

+                with torch.no_grad():
+                    outputs = model(**inputs_dict)

    class GPTModelTester(CommonModelTester):

@@ -633,7 +671,7 @@ class CommonTestCases:
                mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)

            config = self.config_class(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
@@ -649,9 +687,10 @@ class CommonTestCases:
            model.to(torch_device)
            model.eval()

-            outputs = model(input_ids, position_ids, token_type_ids)
-            outputs = model(input_ids, position_ids)
-            outputs = model(input_ids)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids)
+                outputs = model(input_ids, position_ids)
+                outputs = model(input_ids)

            hidden_state = outputs[0]
            self.parent.assertListEqual(
@@ -664,7 +703,8 @@ class CommonTestCases:
            model = self.lm_head_model_class(config)
            model.to(torch_device)
            model.eval()
-            outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
            loss, lm_logits = outputs[:2]

            total_voc = self.vocab_size
@@ -681,7 +721,8 @@ class CommonTestCases:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                outputs = model(input_ids)
+                with torch.no_grad():
+                    outputs = model(input_ids)
                presents = outputs[-1]
                self.parent.assertEqual(self.num_hidden_layers, len(presents))
                self.parent.assertListEqual(
@@ -694,7 +735,8 @@ class CommonTestCases:
            model = self.double_head_model_class(config)
            model.to(torch_device)
            model.eval()
-            outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
+            with torch.no_grad():
+                outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                            token_type_ids=token_type_ids, position_ids=position_ids)
            lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
            loss = [lm_loss, mc_loss]
@@ -711,10 +753,8 @@ class CommonTestCases:
                [[], []])

        def create_and_check_model_from_pretrained(self):
-            cache_dir = "/tmp/transformers_test/"
            for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
-                model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
-                shutil.rmtree(cache_dir)
+                model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR)
                self.parent.assertIsNotNone(model)

        def prepare_config_and_inputs_for_common(self):

--- a/transformers/tests/modeling_ctrl_test.py
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -16,7 +16,6 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil
 import pdb

 from transformers import is_torch_available
@@ -27,7 +26,7 @@ if is_torch_available():

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device


 @require_torch
@@ -114,7 +113,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
@@ -205,10 +204,8 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):

    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = CTRLModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = CTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)



--- a/transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -27,7 +27,7 @@ if is_torch_available():

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device


 @require_torch
@@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,
@@ -235,10 +235,8 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):

    # @slow
    # def test_model_from_pretrained(self):
-    #     cache_dir = "/tmp/transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
-    #         shutil.rmtree(cache_dir)
+    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
    #         self.assertIsNotNone(model)

 if __name__ == "__main__":

--- a/transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil

 from transformers import is_torch_available

@@ -27,7 +26,7 @@ if is_torch_available():

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device


 @require_torch
@@ -110,7 +109,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
@@ -239,10 +238,8 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):

    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = GPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)



--- a/transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil

 from transformers import is_torch_available

@@ -27,7 +26,7 @@ if is_torch_available():

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device


 @require_torch
@@ -98,7 +97,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,
@@ -207,10 +206,8 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):

    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)



--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -17,7 +17,6 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil

 from transformers import is_torch_available

@@ -25,11 +24,12 @@ if is_torch_available():
    import torch
    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
                              RobertaForSequenceClassification, RobertaForTokenClassification)
+    from transformers.modeling_roberta import RobertaEmbeddings
    from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP

 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
-from .utils import require_torch, slow, torch_device
+from .utils import CACHE_DIR, require_torch, slow, torch_device


 @require_torch
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
@@ -199,12 +199,61 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):

    @slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/transformers_test/"
        for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
+            model = RobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR)
            self.assertIsNotNone(model)

+    def test_create_position_ids_respects_padding_index(self):
+        """ Ensure that the default position ids only assign a sequential position to non-padding tokens. This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        model = RobertaEmbeddings(config=config)
+
+        input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
+        expected_positions = torch.as_tensor([[
+            0 + model.padding_idx + 1,
+            1 + model.padding_idx + 1,
+            2 + model.padding_idx + 1,
+            model.padding_idx
+        ]])
+
+        position_ids = model.create_position_ids_from_input_ids(input_ids)
+        self.assertEqual(
+            position_ids.shape,
+            expected_positions.shape
+        )
+        self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))
+
+    def test_create_position_ids_from_inputs_embeds(self):
+        """ Ensure that the default position ids assign a sequential position to every token of inputs_embeds. This is a regression
+        test for https://github.com/huggingface/transformers/issues/1761
+
+        The position ids should be masked with the embedding object's padding index. Therefore, the
+        first available non-padding position index is RobertaEmbeddings.padding_idx + 1
+        """
+        config = self.model_tester.prepare_config_and_inputs()[0]
+        embeddings = RobertaEmbeddings(config=config)
+
+        inputs_embeds = torch.Tensor(2, 4, 30)
+        expected_single_positions = [
+            0 + embeddings.padding_idx + 1,
+            1 + embeddings.padding_idx + 1,
+            2 + embeddings.padding_idx + 1,
+            3 + embeddings.padding_idx + 1,
+        ]
+        expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
+        position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
+        self.assertEqual(
+            position_ids.shape,
+            expected_positions.shape
+        )
+        self.assertTrue(
+            torch.all(torch.eq(position_ids, expected_positions))
+        )


 class RobertaModelIntegrationTest(unittest.TestCase):