Merge branch 'master' into cli

c7be096c · thomwolf · 3492a6ec · 33adab2b · c7be096c · c7be096c
Commit c7be096c authored Dec 19, 2019 by thomwolf
20 changed files
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -26,13 +26,12 @@ from tensorflow.python.keras.saving import hdf5_format
 import h5py
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
+                         cached_path, hf_bucket_url, is_remote_url)
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 logger = logging.getLogger(__name__)
-DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
 class TFPreTrainedModel(tf.keras.Model):
    r""" Base class for all TF models.
@@ -61,7 +60,7 @@ class TFPreTrainedModel(tf.keras.Model):
        Returns:
-            tf.Tensor with dummy inputs
+            dict mapping 'input_ids' to a tf.Tensor with dummy inputs
        """
-        return tf.constant(DUMMY_INPUTS)
+        return {'input_ids': tf.constant(DUMMY_INPUTS)}
    def __init__(self, config, *inputs, **kwargs):
        super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
@@ -178,6 +177,7 @@ class TFPreTrainedModel(tf.keras.Model):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
@@ -263,12 +263,14 @@ class TFPreTrainedModel(tf.keras.Model):
                    raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
                        [WEIGHTS_NAME, TF2_WEIGHTS_NAME],
                        pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                archive_file = pretrained_model_name_or_path + ".index"
            else:
-                archive_file = pretrained_model_name_or_path
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
+                if from_pt:
+                    raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.")
            # redirect to the cache, if necessary
            try:
@@ -301,7 +303,7 @@ class TFPreTrainedModel(tf.keras.Model):
        if from_pt:
            # Load from a PyTorch checkpoint
-            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file)
+            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
        ret = model(model.dummy_inputs, training=False)  # build the network with dummy inputs

--- a/transformers/modeling_tf_xlm.py
+++ b/transformers/modeling_tf_xlm.py
@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
            langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
        else:
            langs_list = None
-        return [inputs_list, attns_list, langs_list]
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
 XLM_START_DOCSTRING = r"""    The XLM model was proposed in

--- a/transformers/modeling_tf_xlnet.py
+++ b/transformers/modeling_tf_xlnet.py
@@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        self.use_bfloat16 = config.use_bfloat16
        self.initializer_range = config.initializer_range
-        self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
+        self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
        self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
        self.dropout = tf.keras.layers.Dropout(config.dropout)

--- a/transformers/modeling_transfo_xl.py
+++ b/transformers/modeling_transfo_xl.py
@@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states
-        self.n_token = config.n_token
+        self.n_token = config.vocab_size
        self.d_embed = config.d_embed
        self.d_model = config.d_model
        self.n_head = config.n_head
        self.d_head = config.d_head
-        self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
+        self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
                                          div_val=config.div_val)
        self.drop = nn.Dropout(config.dropout)
@@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
        self.sample_softmax = config.sample_softmax
        # use sampled softmax
        if config.sample_softmax > 0:
-            self.out_layer = nn.Linear(config.d_model, config.n_token)
+            self.out_layer = nn.Linear(config.d_model, config.vocab_size)
-            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
+            self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
        # use adaptive softmax (including standard softmax)
        else:
-            self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
+            self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
                                                    config.cutoffs, div_val=config.div_val)
        self.init_weights()

--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -31,11 +31,11 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, DUMMY_INPUTS,
+                         cached_path, hf_bucket_url, is_remote_url)
 logger = logging.getLogger(__name__)
 try:
    from torch.nn import Identity
 except ImportError:
@@ -71,6 +71,15 @@ class PreTrainedModel(nn.Module):
    load_tf_weights = lambda model, config, path: None
    base_model_prefix = ""
+    @property
+    def dummy_inputs(self):
+        """ Dummy inputs to do a forward pass in the network.
+
+        Returns:
+            dict with a single key ``'input_ids'`` mapped to a torch.Tensor
+            built from the ``DUMMY_INPUTS`` constant imported from ``file_utils``.
+        """
+        return {'input_ids': torch.tensor(DUMMY_INPUTS)}
    def __init__(self, config, *inputs, **kwargs):
        super(PreTrainedModel, self).__init__()
        if not isinstance(config, PretrainedConfig):
@@ -160,8 +169,7 @@ class PreTrainedModel(nn.Module):
        base_model.vocab_size = new_num_tokens
        # Tie weights again if needed
-        if hasattr(self, 'tie_weights'):
+        self.tie_weights()
-            self.tie_weights()
        return model_embeds
@@ -265,6 +273,7 @@ class PreTrainedModel(nn.Module):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
@@ -318,10 +327,6 @@ class PreTrainedModel(nn.Module):
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
        """
-        if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
-            logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
-                           "https://github.com/google-research/google-research/issues/119 for more information.")
        config = kwargs.pop('config', None)
        state_dict = kwargs.pop('state_dict', None)
        cache_dir = kwargs.pop('cache_dir', None)
@@ -362,14 +367,16 @@ class PreTrainedModel(nn.Module):
                    raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
                        [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
                        pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                archive_file = pretrained_model_name_or_path
            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
                    pretrained_model_name_or_path + ".index")
                archive_file = pretrained_model_name_or_path + ".index"
            else:
-                archive_file = pretrained_model_name_or_path
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
+                if from_tf:
+                    raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
            # redirect to the cache, if necessary
            try:
@@ -473,8 +480,7 @@ class PreTrainedModel(nn.Module):
                raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                model.__class__.__name__, "\n\t".join(error_msgs)))
-        if hasattr(model, 'tie_weights'):
+        model.tie_weights()  # make sure word embedding weights are still tied if needed
-            model.tie_weights()  # make sure word embedding weights are still tied
        # Set model in evaluation mode to deactivate DropOut modules by default
        model.eval()

--- a/transformers/modeling_xlm.py
+++ b/transformers/modeling_xlm.py
@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
    def __init__(self, *inputs, **kwargs):
        super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
+    @property
+    def dummy_inputs(self):
+        """ Dummy inputs to do a forward pass in the network.
+
+        Returns:
+            dict with keys ``'input_ids'`` and ``'attention_mask'`` (each a 3x5 integer
+            torch.Tensor) and ``'langs'``, which is a 3x5 torch.Tensor when
+            ``config.use_lang_emb`` is truthy and ``config.n_langs > 1``, and ``None`` otherwise.
+        """
+        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        else:
+            langs_list = None
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
    def _init_weights(self, module):
        """ Initialize the weights. """
        if isinstance(module, nn.Embedding):
@@ -646,7 +656,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
                                               langs=langs,
                                               token_type_ids=token_type_ids,
                                               position_ids=position_ids,
-                                               lengths=lengths, 
+                                               lengths=lengths,
                                               cache=cache,
                                               head_mask=head_mask,
                                               inputs_embeds=inputs_embeds)

--- a/transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel):
        self.clamp_len = config.clamp_len
        self.n_layer = config.n_layer
-        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
+        self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
        self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
        self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
        self.dropout = nn.Dropout(config.dropout)
@@ -940,7 +940,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
        self.same_length = config.same_length
        self.transformer = XLNetModel(config)
-        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
+        self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
        self.init_weights()

--- a/transformers/tests/configuration_common_test.py
+++ b/transformers/tests/configuration_common_test.py
@@ -16,15 +16,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import copy
 import os
-import shutil
 import json
-import random
+import tempfile
-import uuid
 import unittest
-import logging
+from .tokenization_tests_commons import TemporaryDirectory
 class ConfigTester(object):
@@ -48,16 +45,28 @@ class ConfigTester(object):
    def create_and_test_config_to_json_file(self):
+        """Write a config to ``config.json`` inside a temporary directory, reload it with
+        ``from_json_file`` and check that both ``to_dict()`` results are equal."""
        config_first = self.config_class(**self.inputs_dict)
-        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
-        config_first.to_json_file(json_file_path)
+        with TemporaryDirectory() as tmpdirname:
-        config_second = self.config_class.from_json_file(json_file_path)
+            json_file_path = os.path.join(tmpdirname, "config.json")
-        os.remove(json_file_path)
+            config_first.to_json_file(json_file_path)
+            config_second = self.config_class.from_json_file(json_file_path)
+        # compared after the temporary directory block has been exited
+        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
+    def create_and_test_config_from_and_save_pretrained(self):
+        """Save a config with ``save_pretrained`` into a temporary directory, reload it with
+        ``from_pretrained`` and check that both ``to_dict()`` results are equal."""
+        config_first = self.config_class(**self.inputs_dict)
+        with TemporaryDirectory() as tmpdirname:
+            config_first.save_pretrained(tmpdirname)
+            config_second = self.config_class.from_pretrained(tmpdirname)
        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
    def run_common_tests(self):
+        """Run the common-properties, JSON-string, JSON-file and save/from_pretrained checks in sequence."""
        self.create_and_test_config_common_properties()
        self.create_and_test_config_to_json_string()
        self.create_and_test_config_to_json_file()
+        self.create_and_test_config_from_and_save_pretrained()
 if __name__ == "__main__":
    unittest.main()
\ No newline at end of file
--- a/transformers/tests/fixtures/empty.txt
+++ b/transformers/tests/fixtures/empty.txt
--- a/transformers/tests/hf_api_test.py
+++ b/transformers/tests/hf_api_test.py
@@ -15,18 +15,30 @@
 from __future__ import absolute_import, division, print_function
 import os
-import six
 import time
 import unittest
-from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
+import requests
+import six
+from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj
 USER = "__DUMMY_TRANSFORMERS_USER__"
 PASS = "__DUMMY_TRANSFORMERS_PASS__"
-FILE_KEY = "Test-{}.txt".format(int(time.time()))
+# (upload key, local fixture path) pairs used by the presign/upload tests.
+# Each key embeds int(time.time()), evaluated once when this module is imported.
+FILES = [
-FILE_PATH = os.path.join(
+    (
-    os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
+        "Test-{}.txt".format(int(time.time())),
-)
+        os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
+        )
+    ),
+    (
+        "yoyo {}.txt".format(int(time.time())), # space is intentional
+        os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"
+        )
+    ),
+]
@@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest):
        self.assertEqual(user, USER)
    def test_presign(self):
+        """For every (key, path) pair in FILES, presign the key and check that the result
+        is a PresignedUrl whose ``type`` is "text/plain"."""
-        urls = self._api.presign(token=self._token, filename=FILE_KEY)
+        for FILE_KEY, FILE_PATH in FILES:
-        self.assertIsInstance(urls, PresignedUrl)
+            urls = self._api.presign(token=self._token, filename=FILE_KEY)
-        self.assertEqual(urls.type, "text/plain")
+            self.assertIsInstance(urls, PresignedUrl)
+            self.assertEqual(urls.type, "text/plain")
    def test_presign_and_upload(self):
+        """For every (key, path) pair in FILES, upload the file, check that the returned
+        access URL is a string, then GET that URL and compare the response text with the
+        local file's contents."""
-        access_url = self._api.presign_and_upload(
+        for FILE_KEY, FILE_PATH in FILES:
-            token=self._token, filename=FILE_KEY, filepath=FILE_PATH
+            access_url = self._api.presign_and_upload(
-        )
+                token=self._token, filename=FILE_KEY, filepath=FILE_PATH
-        self.assertIsInstance(access_url, six.string_types)
+            )
+            self.assertIsInstance(access_url, six.string_types)
+            with open(FILE_PATH, 'r') as f:
+                body = f.read()
+            # fetch the uploaded object back over HTTP and compare with what was read locally
+            r = requests.get(access_url)
+            self.assertEqual(r.text, body)
    def test_list_objs(self):
        objs = self._api.list_objs(token=self._token)

--- a/transformers/tests/model_card_test.py
+++ b/transformers/tests/model_card_test.py
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+import os
+import json
+import unittest
+from transformers.model_card import ModelCard
+from .tokenization_tests_commons import TemporaryDirectory
+class ModelCardTester(unittest.TestCase):
+    """Tests that build a ``ModelCard`` from a dict and round-trip it through a JSON string,
+    a JSON file and a ``save_pretrained`` directory."""
+    def setUp(self):
+        # Only some sections are supplied here: 'intended_use', 'factors',
+        # 'ethical_considerations' and 'caveats_and_recommendations' are left out.
+        self.inputs_dict = {'model_details': {
+                                'Organization': 'testing',
+                                'Model date': 'today',
+                                'Model version': 'v2.1, Developed by Test Corp in 2019.',
+                                'Architecture': 'Convolutional Neural Network.',
+                                },
+                            'metrics': 'BLEU and ROUGE-1',
+                            'evaluation_data':{
+                                'Datasets':{
+                                    'BLEU': 'My-great-dataset-v1',
+                                    'ROUGE-1': 'My-short-dataset-v2.1',
+                                },
+                                'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf'
+                            },
+                            'training_data':{
+                                'Dataset': 'English Wikipedia dump dated 2018-12-01',
+                                'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf'
+                            },
+                            'quantitative_analyses': {
+                                'BLEU': 55.1,
+                                'ROUGE-1': 76,
+                            },
+                            }
+    def test_model_card_common_properties(self):
+        """Check that a card built from ``inputs_dict`` exposes all nine section attributes,
+        including the ones ``inputs_dict`` does not provide."""
+        model_card = ModelCard.from_dict(self.inputs_dict)
+        self.assertTrue(hasattr(model_card, 'model_details'))
+        self.assertTrue(hasattr(model_card, 'intended_use'))
+        self.assertTrue(hasattr(model_card, 'factors'))
+        self.assertTrue(hasattr(model_card, 'metrics'))
+        self.assertTrue(hasattr(model_card, 'evaluation_data'))
+        self.assertTrue(hasattr(model_card, 'training_data'))
+        self.assertTrue(hasattr(model_card, 'quantitative_analyses'))
+        self.assertTrue(hasattr(model_card, 'ethical_considerations'))
+        self.assertTrue(hasattr(model_card, 'caveats_and_recommendations'))
+    def test_model_card_to_json_string(self):
+        """Serialize the card with ``to_json_string``, parse it back with ``json.loads`` and
+        check that every key of ``inputs_dict`` comes back with an equal value."""
+        model_card = ModelCard.from_dict(self.inputs_dict)
+        obj = json.loads(model_card.to_json_string())
+        for key, value in self.inputs_dict.items():
+            self.assertEqual(obj[key], value)
+    def test_model_card_to_json_file(self):
+        """Write the card to ``model_card.json`` in a temporary directory, reload it with
+        ``from_json_file`` and check that both ``to_dict()`` results are equal."""
+        model_card_first = ModelCard.from_dict(self.inputs_dict)
+        with TemporaryDirectory() as tmpdirname:
+            filename = os.path.join(tmpdirname, u"model_card.json")
+            model_card_first.to_json_file(filename)
+            model_card_second = ModelCard.from_json_file(filename)
+        # compared after the temporary directory block has been exited
+        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
+    def test_model_card_from_and_save_pretrained(self):
+        """Save the card with ``save_pretrained`` into a temporary directory, reload it with
+        ``from_pretrained`` and check that both ``to_dict()`` results are equal."""
+        model_card_first = ModelCard.from_dict(self.inputs_dict)
+        with TemporaryDirectory() as tmpdirname:
+            model_card_first.save_pretrained(tmpdirname)
+            model_card_second = ModelCard.from_pretrained(tmpdirname)
+        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
+if __name__ == "__main__":
+    unittest.main()
--- a/transformers/tests/modeling_albert_test.py
+++ b/transformers/tests/modeling_albert_test.py
@@ -110,7 +110,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,

--- a/transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -22,7 +22,7 @@ import logging
 from transformers import is_torch_available
-from .utils import require_torch, slow
+from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
 if is_torch_available():
    from transformers import (AutoConfig, BertConfig,
@@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
            self.assertIsNotNone(model)
            self.assertIsInstance(model, BertForQuestionAnswering)
+    def test_from_pretrained_identifier(self):
+        """Load ``SMALL_MODEL_IDENTIFIER`` (imported from ``.utils``) through
+        ``AutoModelWithLMHead.from_pretrained`` and check the result is a ``BertForMaskedLM``."""
+        # NOTE(review): the identifier's value is not visible here; presumably a
+        # user-uploaded model identifier name that needs network access -- confirm.
+        logging.basicConfig(level=logging.INFO)
+        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, BertForMaskedLM)
 if __name__ == "__main__":
    unittest.main()
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -109,7 +109,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,

--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -58,7 +58,7 @@ else:
 def _config_zero_init(config):
+    """Return a deep copy of ``config`` in which every attribute whose name contains
+    '_range', '_std' or 'initializer_factor' is set to 0.0; ``config`` itself is not modified."""
    configs_no_init = copy.deepcopy(config)
    for key in configs_no_init.__dict__.keys():
-        if '_range' in key or '_std' in key:
+        if '_range' in key or '_std' in key or 'initializer_factor' in key:
            setattr(configs_no_init, key, 0.0)
    return configs_no_init
@@ -73,6 +73,7 @@ class CommonTestCases:
        test_pruning = True
        test_resize_embeddings = True
        test_head_masking = True
+        is_encoder_decoder = False
        def test_save_load(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -83,6 +84,8 @@ class CommonTestCases:
                model.eval()
                with torch.no_grad():
                    outputs = model(**inputs_dict)
+                out_2 = outputs[0].numpy()
+                out_2[np.isnan(out_2)] = 0
                with TemporaryDirectory() as tmpdirname:
                    model.save_pretrained(tmpdirname)
@@ -93,9 +96,7 @@ class CommonTestCases:
                    # Make sure we don't have nans
                    out_1 = after_outputs[0].cpu().numpy()
-                    out_2 = outputs[0].cpu().numpy()
+                    out_1[np.isnan(out_1)] = 0
-                    out_1 = out_1[~np.isnan(out_1)]
-                    out_2 = out_2[~np.isnan(out_2)]
                    max_diff = np.amax(np.abs(out_1 - out_2))
                    self.assertLessEqual(max_diff, 1e-5)
@@ -117,20 +118,32 @@ class CommonTestCases:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
+                with torch.no_grad():
-                self.assertEqual(first.ne(second).sum().item(), 0)
+                    first = model(**inputs_dict)[0]
+                    second = model(**inputs_dict)[0]
+                out_1 = first.cpu().numpy()
+                out_2 = second.cpu().numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
            for model_class in self.all_model_classes:
                config.output_attentions = True
                config.output_hidden_states = False
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, False)
@@ -138,28 +151,42 @@ class CommonTestCases:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
+                    encoder_seq_length ,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_key_length])
                out_len = len(outputs)
+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2)-1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         decoder_seq_length,
+                         decoder_key_length
+                         ])
                # Check attention is always last and order is fine
                config.output_attentions = True
                config.output_hidden_states = True
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
-                self.assertEqual(out_len+1, len(outputs))
+                    outputs = model(**inputs_dict)
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                self.assertEqual(model.config.output_attentions, True)
                self.assertEqual(model.config.output_hidden_states, True)
-                attentions = outputs[-1]
+                self_attentions = outputs[-1]
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
+                    list(self_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
+                    encoder_seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_key_length])
        def test_torchscript(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -223,7 +250,6 @@ class CommonTestCases:
                self.assertTrue(models_equal)
        def test_headmasking(self):
            if not self.test_head_masking:
                return
@@ -278,7 +304,6 @@ class CommonTestCases:
                self.assertNotEqual(
                    attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
        def test_head_pruning(self):
            if not self.test_pruning:
                return
@@ -297,7 +322,8 @@ class CommonTestCases:
                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                -1: [0]}
                model.prune_heads(heads_to_prune)
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
@@ -333,7 +359,8 @@ class CommonTestCases:
                model = model_class.from_pretrained(directory)
                model.to(torch_device)
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], 1)
                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
@@ -362,7 +389,8 @@ class CommonTestCases:
                model.to(torch_device)
                model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], 1)
@@ -389,7 +417,8 @@ class CommonTestCases:
                model.to(torch_device)
                model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -406,7 +435,8 @@ class CommonTestCases:
                model.to(torch_device)
                shutil.rmtree(directory)
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -417,7 +447,8 @@ class CommonTestCases:
                heads_to_prune = {0: [0], 2: [1, 2]}
                model.prune_heads(heads_to_prune)
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                attentions = outputs[-1]
                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
@@ -427,7 +458,6 @@ class CommonTestCases:
                self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
        def test_hidden_states_output(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -437,14 +467,16 @@ class CommonTestCases:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                hidden_states = outputs[-1]
                self.assertEqual(model.config.output_attentions, False)
                self.assertEqual(model.config.output_hidden_states, True)
                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size])
+                    [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
+                     self.model_tester.hidden_size])
        def test_resize_tokens_embeddings(self):
            original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -550,8 +582,14 @@ class CommonTestCases:
        def test_inputs_embeds(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
-            del inputs_dict["input_ids"]
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]
            for model_class in self.all_model_classes:
                model = model_class(config)
@@ -559,9 +597,14 @@ class CommonTestCases:
                model.eval()
                wte = model.get_input_embeddings()
-                inputs_dict["inputs_embeds"] = wte(input_ids)
+                if not self.is_encoder_decoder:
-                outputs = model(**inputs_dict)
+                    inputs_dict["inputs_embeds"] = wte(input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
    class GPTModelTester(CommonModelTester):
@@ -633,7 +676,7 @@ class CommonTestCases:
                mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
            config = self.config_class(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_positions=self.n_positions,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
@@ -649,9 +692,10 @@ class CommonTestCases:
            model.to(torch_device)
            model.eval()
-            outputs = model(input_ids, position_ids, token_type_ids)
+            with torch.no_grad():
-            outputs = model(input_ids, position_ids)
+                outputs = model(input_ids, position_ids, token_type_ids)
-            outputs = model(input_ids)
+                outputs = model(input_ids, position_ids)
+                outputs = model(input_ids)
            hidden_state = outputs[0]
            self.parent.assertListEqual(
@@ -664,7 +708,8 @@ class CommonTestCases:
            model = self.lm_head_model_class(config)
            model.to(torch_device)
            model.eval()
-            outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
            loss, lm_logits = outputs[:2]
            total_voc = self.vocab_size
@@ -681,7 +726,8 @@ class CommonTestCases:
                model = model_class(config)
                model.to(torch_device)
                model.eval()
-                outputs = model(input_ids)
+                with torch.no_grad():
+                    outputs = model(input_ids)
                presents = outputs[-1]
                self.parent.assertEqual(self.num_hidden_layers, len(presents))
                self.parent.assertListEqual(
@@ -694,7 +740,8 @@ class CommonTestCases:
            model = self.double_head_model_class(config)
            model.to(torch_device)
            model.eval()
-            outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
+            with torch.no_grad():
+                outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                            token_type_ids=token_type_ids, position_ids=position_ids)
            lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
            loss = [lm_loss, mc_loss]

--- a/transformers/tests/modeling_ctrl_test.py
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -114,7 +114,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,

--- a/transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                dim=self.hidden_size,
                n_layers=self.num_hidden_layers,
                n_heads=self.num_attention_heads,

--- a/transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -110,7 +110,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,

--- a/transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -98,7 +98,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                n_embd=self.hidden_size,
                n_layer=self.num_hidden_layers,
                n_head=self.num_attention_heads,

--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                choice_labels = ids_tensor([self.batch_size], self.num_choices)
            config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,