Merge pull request #1203 from huggingface/tf2

[2.0] TF 2.0 support

Merge pull request #1203 from huggingface/tf2
[2.0] TF 2.0 support
17ea43cf · Thomas Wolf · GitHub · 4a233e5b · 80bf868a · 17ea43cf
Unverified Commit 17ea43cf authored Sep 26, 2019 by Thomas Wolf Committed by GitHub Sep 26, 2019
20 changed files
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -31,7 +31,7 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME
+from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
 logger = logging.getLogger(__name__)
@@ -52,16 +52,16 @@ except ImportError:
 class PreTrainedModel(nn.Module):
    r""" Base class for all models.
-        :class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
+        :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
        as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
        Class attributes (overridden by derived classes):
-            - ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
+            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
            - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
            - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
-                - ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
+                - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
-                - ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
+                - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
                - ``path``: a path (string) to the TensorFlow checkpoint.
            - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
@@ -189,7 +189,7 @@ class PreTrainedModel(nn.Module):
    def save_pretrained(self, save_directory):
        """ Save a model and its configuration file to a directory, so that it
-            can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method.
+            can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
        """
        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -201,8 +201,8 @@ class PreTrainedModel(nn.Module):
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
+        logger.info("Model weights saved in {}".format(output_model_file))
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -220,23 +220,24 @@ class PreTrainedModel(nn.Module):
            pretrained_model_name_or_path: either:
                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+                - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
            model_args: (`optional`) Sequence of positional arguments:
                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
-            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
-                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
            state_dict: (`optional`) dict:
                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
            cache_dir: (`optional`) string:
                Path to a directory in which a downloaded pre-trained model
@@ -256,7 +257,7 @@ class PreTrainedModel(nn.Module):
                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
-                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
        Examples::
@@ -289,103 +290,125 @@ class PreTrainedModel(nn.Module):
            model_kwargs = kwargs
        # Load model
-        if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
+        if pretrained_model_name_or_path is not None:
-            archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
+            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-        elif os.path.isdir(pretrained_model_name_or_path):
+                archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
-            if from_tf:
+            elif os.path.isdir(pretrained_model_name_or_path):
-                # Directly load from a TensorFlow checkpoint
+                if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
-                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
+                    # Load from a TF 1.0 checkpoint
+                    archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
+                elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
+                    # Load from a TF 2.0 checkpoint
+                    archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
+                elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
+                    # Load from a PyTorch checkpoint
+                    archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+                else:
+                    raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
+                        [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
+                        pretrained_model_name_or_path))
+            elif os.path.isfile(pretrained_model_name_or_path):
+                archive_file = pretrained_model_name_or_path
            else:
-                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+                assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
-        else:
-            if from_tf:
-                # Directly load from a TensorFlow checkpoint
                archive_file = pretrained_model_name_or_path + ".index"
+            # redirect to the cache, if necessary
+            try:
+                resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            except EnvironmentError as e:
+                if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
+                    logger.error(
+                        "Couldn't reach server at '{}' to download pretrained weights.".format(
+                            archive_file))
+                else:
+                    logger.error(
+                        "Model name '{}' was not found in model name list ({}). "
+                        "We assumed '{}' was a path or url but couldn't find any file "
+                        "associated to this path or url.".format(
+                            pretrained_model_name_or_path,
+                            ', '.join(cls.pretrained_model_archive_map.keys()),
+                            archive_file))
+                raise e
+            if resolved_archive_file == archive_file:
+                logger.info("loading weights file {}".format(archive_file))
            else:
-                archive_file = pretrained_model_name_or_path
+                logger.info("loading weights file {} from cache at {}".format(
-        # redirect to the cache, if necessary
+                    archive_file, resolved_archive_file))
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
-        except EnvironmentError as e:
-            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find any file "
-                    "associated to this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(cls.pretrained_model_archive_map.keys()),
-                        archive_file))
-            raise e
-        if resolved_archive_file == archive_file:
-            logger.info("loading weights file {}".format(archive_file))
        else:
-            logger.info("loading weights file {} from cache at {}".format(
+            resolved_archive_file = None
-                archive_file, resolved_archive_file))
        # Instantiate model.
        model = cls(config, *model_args, **model_kwargs)
        if state_dict is None and not from_tf:
            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        if from_tf:
-            # Directly load from a TensorFlow checkpoint
-            return cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
-        # Convert old format to new format if needed from a PyTorch state_dict
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if 'gamma' in key:
-                new_key = key.replace('gamma', 'weight')
-            if 'beta' in key:
-                new_key = key.replace('beta', 'bias')
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
-        # Load from a PyTorch state_dict
        missing_keys = []
        unexpected_keys = []
        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, '_metadata', None)
+        if from_tf:
-        state_dict = state_dict.copy()
+            if resolved_archive_file.endswith('.index'):
-        if metadata is not None:
+                # Load from a TensorFlow 1.X checkpoint - provided by original authors
-            state_dict._metadata = metadata
+                model = cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
+            else:
-        def load(module, prefix=''):
+                # Load from our TensorFlow 2.0 checkpoints
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+                try:
-            module._load_from_state_dict(
+                    from transformers import load_tf2_checkpoint_in_pytorch_model
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+                    model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
-            for name, child in module._modules.items():
+                except ImportError as e:
-                if child is not None:
+                    logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
-                    load(child, prefix + name + '.')
+                        "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
+                    raise e
-        # Make sure we are able to load base models as well as derived models (with heads)
+        else:
-        start_prefix = ''
+            # Convert old format to new format if needed from a PyTorch state_dict
-        model_to_load = model
+            old_keys = []
-        if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+            new_keys = []
-            start_prefix = cls.base_model_prefix + '.'
+            for key in state_dict.keys():
-        if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+                new_key = None
-            model_to_load = getattr(model, cls.base_model_prefix)
+                if 'gamma' in key:
+                    new_key = key.replace('gamma', 'weight')
-        load(model_to_load, prefix=start_prefix)
+                if 'beta' in key:
-        if len(missing_keys) > 0:
+                    new_key = key.replace('beta', 'bias')
-            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                if new_key:
-                model.__class__.__name__, missing_keys))
+                    old_keys.append(key)
-        if len(unexpected_keys) > 0:
+                    new_keys.append(new_key)
-            logger.info("Weights from pretrained model not used in {}: {}".format(
+            for old_key, new_key in zip(old_keys, new_keys):
-                model.__class__.__name__, unexpected_keys))
+                state_dict[new_key] = state_dict.pop(old_key)
-        if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+            # copy state_dict so _load_from_state_dict can modify it
-                               model.__class__.__name__, "\n\t".join(error_msgs)))
+            metadata = getattr(state_dict, '_metadata', None)
+            state_dict = state_dict.copy()
+            if metadata is not None:
+                state_dict._metadata = metadata
+            def load(module, prefix=''):
+                local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+                module._load_from_state_dict(
+                    state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+                for name, child in module._modules.items():
+                    if child is not None:
+                        load(child, prefix + name + '.')
+            # Make sure we are able to load base models as well as derived models (with heads)
+            start_prefix = ''
+            model_to_load = model
+            if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+                start_prefix = cls.base_model_prefix + '.'
+            if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+                model_to_load = getattr(model, cls.base_model_prefix)
+            load(model_to_load, prefix=start_prefix)
+            if len(missing_keys) > 0:
+                logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                    model.__class__.__name__, missing_keys))
+            if len(unexpected_keys) > 0:
+                logger.info("Weights from pretrained model not used in {}: {}".format(
+                    model.__class__.__name__, unexpected_keys))
+            if len(error_msgs) > 0:
+                raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                                model.__class__.__name__, "\n\t".join(error_msgs)))
        if hasattr(model, 'tie_weights'):
            model.tie_weights()  # make sure word embedding weights are still tied
@@ -531,7 +554,7 @@ class SQuADHead(nn.Module):
    r""" A SQuAD head inspired by XLNet.
    Parameters:
-        config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
    Inputs:
        **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
@@ -682,7 +705,7 @@ class SequenceSummary(nn.Module):
            self.last_dropout = nn.Dropout(config.summary_last_dropout)
    def forward(self, hidden_states, cls_index=None):
-        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
+        """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
                if summary_type == 'cls_index' and cls_index is None:

--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -63,7 +63,7 @@ def gelu(x):
    GELU activation
    https://arxiv.org/abs/1606.08415
    https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
-    https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py
+    https://github.com/huggingface/transformers/blob/master/modeling.py
    """
    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
@@ -265,9 +265,9 @@ XLM_START_DOCSTRING = r"""    The XLM model was proposed in
        https://github.com/facebookresearch/XLM
    Parameters:
-        config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 XLM_INPUTS_DOCSTRING = r"""
@@ -278,9 +278,9 @@ XLM_INPUTS_DOCSTRING = r"""
            XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
            the right rather than the left.
-            Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
+            Indices can be obtained using :class:`transformers.XLMTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
@@ -337,11 +337,6 @@ class XLMModel(XLMPreTrainedModel):
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
-    ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output', 
-                  'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads', 
-                  'hidden_dim', 'dropout', 'attention_dropout', 'asm',
-                  'asm_cutoffs', 'asm_div_value']
    def __init__(self, config):  #, dico, is_encoder, with_output):
        super(XLMModel, self).__init__(config)
        self.output_attentions = config.output_attentions
@@ -568,10 +563,10 @@ class XLMPredLayer(nn.Module):
        """
        outputs = ()
        if self.asm is False:
-            scores = self.proj(x).view(-1, self.n_words)
+            scores = self.proj(x)
            outputs = (scores,) + outputs
            if y is not None:
-                loss = F.cross_entropy(scores, y, reduction='elementwise_mean')
+                loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction='elementwise_mean')
                outputs = (loss,) + outputs
        else:
            scores = self.proj.log_prob(x)
@@ -723,6 +718,101 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 @add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss: the average of the Cross-Entropy losses for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:3]
+    """
+    def __init__(self, config):
+        super(XLMForQuestionAnsweringSimple, self).__init__(config)
+        self.transformer = XLMModel(config)
+        # Per-token linear head producing `config.num_labels` scores from each hidden state.
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+        self.init_weights()
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None):
+        # Every argument except the two span labels is forwarded unchanged to the underlying XLMModel.
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
+        sequence_output = transformer_outputs[0]
+        logits = self.qa_outputs(sequence_output)
+        # Unpacking into exactly two tensors requires config.num_labels == 2 (start score, end score).
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+        outputs = (start_logits, end_logits,)
+        if start_positions is not None and end_positions is not None:
+            # Labels may arrive with an extra trailing dimension (e.g. after a multi-GPU split); squeeze it out
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms:
+            # they are clamped to `sequence_length`, which is the index the loss is told to ignore.
+            # The out-of-place `clamp` is used so the caller's label tensors are left unmodified.
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+        # Append whatever extra outputs the transformer returned after its first element
+        # (per the class docstring: hidden states / attentions when enabled in the config).
+        outputs = outputs + transformer_outputs[1:]
+        return outputs
+@add_start_docstrings("""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
 class XLMForQuestionAnswering(XLMPreTrainedModel):
    r"""
        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:

--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -488,9 +488,9 @@ XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
        https://pytorch.org/docs/stable/nn.html#module
    Parameters:
-        config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 XLNET_INPUTS_DOCSTRING = r"""
@@ -499,9 +499,9 @@ XLNET_INPUTS_DOCSTRING = r"""
            Indices of input sequence tokens in the vocabulary.
            XLNet is a model with relative position embeddings so you can either pad the inputs on
            the right or on the left.
-            Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
+            Indices can be obtained using :class:`transformers.XLNetTokenizer`.
-            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
            The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
@@ -531,8 +531,10 @@ XLNET_INPUTS_DOCSTRING = r"""
            Only used during pretraining for partial prediction or for sequential decoding (generation).
        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
-            The embeddings from these tokens will be summed with the respective token embeddings.
+            The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
-            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+            the important thing is that they should be different for tokens which belong to different segments.
+            The model will compute relative segment differences from the given type indices:
+            0 if the segment ids of two tokens are the same, 1 if not.
        **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
            Mask to avoid performing attention on padding token indices.
            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
@@ -1103,6 +1105,101 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
        return outputs  # return (loss), logits, mems, (hidden states), (attentions)
+@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss, computed as the average of the cross-entropy losses for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+    Examples::
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:3]
+    """
+    def __init__(self, config):
+        super(XLNetForQuestionAnsweringSimple, self).__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = XLNetModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+        self.init_weights()
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None,
+                start_positions=None, end_positions=None):
+        outputs = self.transformer(input_ids,
+                                    attention_mask=attention_mask,
+                                    mems=mems,
+                                    perm_mask=perm_mask,
+                                    target_mapping=target_mapping,
+                                    token_type_ids=token_type_ids,
+                                    input_mask=input_mask, 
+                                    head_mask=head_mask)
+        sequence_output = outputs[0]
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+        outputs = (start_logits, end_logits,) + outputs[2:]
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, the split adds a dimension, so squeeze it out
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
 @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
    the hidden-states output to compute `span start logits` and `span end logits`). """,
    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)

--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
--- a/pytorch_transformers/tests/__init__.py
+++ b/pytorch_transformers/tests/__init__.py
--- a/pytorch_transformers/tests/configuration_common_test.py
+++ b/pytorch_transformers/tests/configuration_common_test.py
--- a/pytorch_transformers/tests/conftest.py
+++ b/pytorch_transformers/tests/conftest.py
--- a/pytorch_transformers/tests/fixtures/input.txt
+++ b/pytorch_transformers/tests/fixtures/input.txt
--- a/pytorch_transformers/tests/fixtures/sample_text.txt
+++ b/pytorch_transformers/tests/fixtures/sample_text.txt
--- a/pytorch_transformers/tests/fixtures/test_sentencepiece.model
+++ b/pytorch_transformers/tests/fixtures/test_sentencepiece.model
--- a/pytorch_transformers/tests/modeling_auto_test.py
+++ b/pytorch_transformers/tests/modeling_auto_test.py
@@ -21,15 +21,20 @@ import shutil
 import pytest
 import logging
-from pytorch_transformers import (AutoConfig, BertConfig,
+from transformers import is_torch_available
-                                  AutoModel, BertModel,
-                                  AutoModelWithLMHead, BertForMaskedLM,
-                                  AutoModelForSequenceClassification, BertForSequenceClassification,
-                                  AutoModelForQuestionAnswering, BertForQuestionAnswering)
-from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+if is_torch_available():
-from .configuration_common_test import ConfigTester
+    from transformers import (AutoConfig, BertConfig,
+                                    AutoModel, BertModel,
+                                    AutoModelWithLMHead, BertForMaskedLM,
+                                    AutoModelForSequenceClassification, BertForSequenceClassification,
+                                    AutoModelForQuestionAnswering, BertForQuestionAnswering)
+    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from .modeling_common_test import (CommonTestCases, ids_tensor)
+    from .configuration_common_test import ConfigTester
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
 class AutoModelTest(unittest.TestCase):

--- a/pytorch_transformers/tests/modeling_bert_test.py
+++ b/pytorch_transformers/tests/modeling_bert_test.py
@@ -20,21 +20,26 @@ import unittest
 import shutil
 import pytest
-from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
+from transformers import is_torch_available
-                                     BertForNextSentencePrediction, BertForPreTraining,
-                                     BertForQuestionAnswering, BertForSequenceClassification,
-                                     BertForTokenClassification, BertForMultipleChoice)
-from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+if is_torch_available():
+    from transformers import (BertConfig, BertModel, BertForMaskedLM,
+                                        BertForNextSentencePrediction, BertForPreTraining,
+                                        BertForQuestionAnswering, BertForSequenceClassification,
+                                        BertForTokenClassification, BertForMultipleChoice)
+    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
 class BertModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
            BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-            BertForTokenClassification)
+            BertForTokenClassification) if is_torch_available() else ()
    class BertModelTester(object):
@@ -305,7 +310,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
    @pytest.mark.slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)

--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -25,12 +25,18 @@ import uuid
 import unittest
 import logging
+import pytest
-import torch
+from transformers import is_torch_available
-from pytorch_transformers import (PretrainedConfig, PreTrainedModel,
+if is_torch_available():
-                                  BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+    import torch
-                                  GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from transformers import (PretrainedConfig, PreTrainedModel,
+                                    BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                    GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
 def _config_zero_init(config):
@@ -62,6 +68,16 @@ class CommonTestCases:
                        self.assertIn(param.data.mean().item(), [0.0, 1.0],
                        msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
+        def test_determinism(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                model.eval()
+                first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
+                self.assertEqual(first.ne(second).sum().item(), 0)
        def test_attention_outputs(self):
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -195,6 +211,9 @@ class CommonTestCases:
                hidden_states = outputs[-2]
                # Remove Nan
+                for t in attentions:
+                    self.assertLess(torch.sum(torch.isnan(t)), t.numel() / 4)  # Check we don't have more than 25% nans (arbitrary)
+                attentions = [t.masked_fill(torch.isnan(t), 0.0) for t in attentions]  # remove them (the test is less complete)
                self.assertIsNotNone(multihead_outputs)
                self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
@@ -602,7 +621,7 @@ class CommonTestCases:
                [[], []])
        def create_and_check_model_from_pretrained(self):
-            cache_dir = "/tmp/pytorch_transformers_test/"
+            cache_dir = "/tmp/transformers_test/"
            for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
                model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
                shutil.rmtree(cache_dir)

--- a/pytorch_transformers/tests/modeling_distilbert_test.py
+++ b/pytorch_transformers/tests/modeling_distilbert_test.py
@@ -17,9 +17,15 @@ from __future__ import division
 from __future__ import print_function
 import unittest
+import pytest
-from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
+from transformers import is_torch_available
-                                  DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
+if is_torch_available():
+    from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
+                                    DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
@@ -28,7 +34,7 @@ from .configuration_common_test import ConfigTester
 class DistilBertModelTest(CommonTestCases.CommonModelTester):
    all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
-                         DistilBertForSequenceClassification)
+                         DistilBertForSequenceClassification) if is_torch_available() else None
    test_pruning = True
    test_torchscript = True
    test_resize_embeddings = True
@@ -205,7 +211,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
    # @pytest.mark.slow
    # def test_model_from_pretrained(self):
-    #     cache_dir = "/tmp/pytorch_transformers_test/"
+    #     cache_dir = "/tmp/transformers_test/"
    #     for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
    #         model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
    #         shutil.rmtree(cache_dir)

--- a/pytorch_transformers/tests/modeling_gpt2_test.py
+++ b/pytorch_transformers/tests/modeling_gpt2_test.py
@@ -20,9 +20,13 @@ import unittest
 import pytest
 import shutil
+from transformers import is_torch_available
-from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+if is_torch_available():
-                                  GPT2LMHeadModel, GPT2DoubleHeadsModel)
+    from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                    GPT2LMHeadModel, GPT2DoubleHeadsModel)
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
@@ -30,7 +34,7 @@ from .configuration_common_test import ConfigTester
 class GPT2ModelTest(CommonTestCases.CommonModelTester):
-    all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel)
+    all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
    class GPT2ModelTester(object):
@@ -40,7 +44,9 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                     seq_length=7,
                     is_training=True,
                     use_token_type_ids=True,
+                     use_input_mask=True,
                     use_labels=True,
+                     use_mc_token_ids=True,
                     vocab_size=99,
                     hidden_size=32,
                     num_hidden_layers=5,
@@ -62,7 +68,9 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
            self.seq_length = seq_length
            self.is_training = is_training
            self.use_token_type_ids = use_token_type_ids
+            self.use_input_mask = use_input_mask
            self.use_labels = use_labels
+            self.use_mc_token_ids = use_mc_token_ids
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
@@ -82,10 +90,18 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
        def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+            mc_token_ids = None
+            if self.use_mc_token_ids:
+                mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)
            sequence_labels = None
            token_labels = None
            choice_labels = None
@@ -111,14 +127,14 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
-            return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
+            return config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, sequence_labels, token_labels, choice_labels
        def check_loss_output(self, result):
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])
-        def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args):
+        def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = GPT2Model(config=config)
            model.eval()
@@ -135,7 +151,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertEqual(len(result["presents"]), config.n_layer)
-        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
+        def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
            model = GPT2LMHeadModel(config)
            model.eval()
@@ -153,15 +169,27 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
-        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
+        def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
            model = GPT2DoubleHeadsModel(config)
            model.eval()
-            loss, lm_logits, mc_logits, _ = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
+            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            inputs = {'input_ids': multiple_choice_inputs_ids,
+                      'mc_token_ids': mc_token_ids,
+                      'attention_mask': multiple_choice_input_mask,
+                      'token_type_ids': multiple_choice_token_type_ids,
+                      'lm_labels': multiple_choice_inputs_ids}
+            loss, lm_logits, mc_logits, _ = model(**inputs)
            result = {
                "loss": loss,
-                "lm_logits": lm_logits
+                "lm_logits": lm_logits,
+                "mc_logits": mc_logits
            }
            self.parent.assertListEqual(
@@ -169,11 +197,17 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                [self.batch_size, self.num_choices, self.seq_length, self.vocab_size])
+            self.parent.assertListEqual(
+                list(result["mc_logits"].size()),
+                [self.batch_size, self.num_choices])
        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            (config, input_ids, input_mask, head_mask, token_type_ids,
+             mc_token_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {
                'input_ids': input_ids,
                'token_type_ids': token_type_ids,
@@ -203,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
    @pytest.mark.slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)

--- a/pytorch_transformers/tests/modeling_openai_test.py
+++ b/pytorch_transformers/tests/modeling_openai_test.py
@@ -20,9 +20,13 @@ import unittest
 import pytest
 import shutil
+from transformers import is_torch_available
-from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+if is_torch_available():
-                                  OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+    from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                    OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
@@ -30,7 +34,7 @@ from .configuration_common_test import ConfigTester
 class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
-    all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+    all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
    class OpenAIGPTModelTester(object):
@@ -201,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
    @pytest.mark.slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
        for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)

--- a/pytorch_transformers/tests/modeling_roberta_test.py
+++ b/pytorch_transformers/tests/modeling_roberta_test.py
@@ -19,10 +19,15 @@ from __future__ import print_function
 import unittest
 import shutil
 import pytest
-import torch
-from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
+from transformers import is_torch_available
-from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+if is_torch_available():
+    import torch
+    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
+    from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
 from .modeling_common_test import (CommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
@@ -30,7 +35,7 @@ from .configuration_common_test import ConfigTester
 class RobertaModelTest(CommonTestCases.CommonModelTester):
-    all_model_classes = (RobertaForMaskedLM, RobertaModel)
+    all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
    class RobertaModelTester(object):
@@ -175,7 +180,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
    @pytest.mark.slow
    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
+        cache_dir = "/tmp/transformers_test/"
        for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)

--- a/transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import unittest
+import shutil
+import pytest
+import logging
+from transformers import is_tf_available
+if is_tf_available():
+    from transformers import (AutoConfig, BertConfig,
+                                      TFAutoModel, TFBertModel,
+                                      TFAutoModelWithLMHead, TFBertForMaskedLM,
+                                      TFAutoModelForSequenceClassification, TFBertForSequenceClassification,
+                                      TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering)
+    from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+    from .modeling_common_test import (CommonTestCases, ids_tensor)
+    from .configuration_common_test import ConfigTester
+else:
+    pytestmark = pytest.mark.skip("Require TensorFlow")
+class TFAutoModelTest(unittest.TestCase):
+    # Checks that the auto classes resolve the 'bert-base-uncased' shortcut name to the
+    # BERT-specific classes: every test loads a config through AutoConfig and a model
+    # through one TFAuto* class, then asserts on the concrete types returned.
+    # NOTE(review): every from_pretrained call passes force_download=True, yet none of
+    # these tests carries a @pytest.mark.slow marker -- confirm that is intended.
+    def test_model_from_pretrained(self):
+        # TFAutoModel is expected to hand back a TFBertModel.
+        import h5py
+        # Fails unless h5py reports an HDF5 version string starting with "1.10".
+        # NOTE(review): the reason for this version check is not visible here -- confirm it is still wanted.
+        self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
+        logging.basicConfig(level=logging.INFO)
+        # Disabled alternative: iterate over the first key of the archive map instead of a hard-coded name.
+        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in ['bert-base-uncased']:
+            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+            model = TFAutoModel.from_pretrained(model_name, force_download=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, TFBertModel)
+    def test_lmhead_model_from_pretrained(self):
+        # TFAutoModelWithLMHead is expected to hand back a TFBertForMaskedLM.
+        logging.basicConfig(level=logging.INFO)
+        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in ['bert-base-uncased']:
+            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+            model = TFAutoModelWithLMHead.from_pretrained(model_name, force_download=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, TFBertForMaskedLM)
+    def test_sequence_classification_model_from_pretrained(self):
+        # TFAutoModelForSequenceClassification is expected to hand back a TFBertForSequenceClassification.
+        logging.basicConfig(level=logging.INFO)
+        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in ['bert-base-uncased']:
+            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+            model = TFAutoModelForSequenceClassification.from_pretrained(model_name, force_download=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, TFBertForSequenceClassification)
+    def test_question_answering_model_from_pretrained(self):
+        # TFAutoModelForQuestionAnswering is expected to hand back a TFBertForQuestionAnswering.
+        logging.basicConfig(level=logging.INFO)
+        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in ['bert-base-uncased']:
+            config = AutoConfig.from_pretrained(model_name, force_download=True)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+            model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, force_download=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, TFBertForQuestionAnswering)
+if __name__ == "__main__":
+    unittest.main()
--- a/transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import unittest
+import shutil
+import pytest
+import sys
+from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester
+from transformers import BertConfig, is_tf_available
+if is_tf_available():
+    import tensorflow as tf
+    from transformers.modeling_tf_bert import (TFBertModel, TFBertForMaskedLM,
+                                                       TFBertForNextSentencePrediction,
+                                                       TFBertForPreTraining,
+                                                       TFBertForSequenceClassification,
+                                                       TFBertForMultipleChoice,
+                                                       TFBertForTokenClassification,
+                                                       TFBertForQuestionAnswering,
+                                                       TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+else:
+    pytestmark = pytest.mark.skip("Require TensorFlow")
+class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
+    # Output-shape tests for the TF BERT model classes. The nested TFBertModelTester
+    # builds a small BertConfig plus inputs and implements one check per model head;
+    # each test_* method below prepares fresh inputs and delegates to one of those checks.
+    # The conditional keeps this tuple empty when TensorFlow is unavailable, because the
+    # TF* names are only imported under is_tf_available().
+    # NOTE(review): TFBertForMultipleChoice is not listed here and is only exercised by
+    # test_for_multiple_choice -- confirm the omission is intentional.
+    all_model_classes = (TFBertModel, TFBertForMaskedLM, TFBertForNextSentencePrediction,
+                         TFBertForPreTraining, TFBertForQuestionAnswering, TFBertForSequenceClassification,
+                         TFBertForTokenClassification) if is_tf_available() else ()
+    class TFBertModelTester(object):
+        # Holds the hyper-parameters of the test model, builds the config and inputs, and
+        # runs the per-head checks. `parent` is the object whose assert* methods are used
+        # to report failures (setUp passes the enclosing test case).
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                    ):
+            # Every argument is stored unchanged under an attribute of the same name.
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+        def prepare_config_and_inputs(self):
+            # Returns the 7-tuple (config, input_ids, token_type_ids, input_mask,
+            # sequence_labels, token_labels, choice_labels). The mask, token type ids and
+            # labels are None when the matching use_* flag is False.
+            # ids_tensor is imported from modeling_tf_common_test; presumably it returns
+            # random integer ids of the given shape, below the given vocab size -- TODO confirm.
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+            config = BertConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                hidden_size=self.hidden_size,
+                num_hidden_layers=self.num_hidden_layers,
+                num_attention_heads=self.num_attention_heads,
+                intermediate_size=self.intermediate_size,
+                hidden_act=self.hidden_act,
+                hidden_dropout_prob=self.hidden_dropout_prob,
+                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                max_position_embeddings=self.max_position_embeddings,
+                type_vocab_size=self.type_vocab_size,
+                initializer_range=self.initializer_range)
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+        def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            # Calls TFBertModel three ways (dict, list, bare input_ids tensor) and checks the
+            # shapes of the two outputs. Each call overwrites sequence_output/pooled_output,
+            # so only the outputs of the final call (input_ids alone) are shape-checked.
+            model = TFBertModel(config=config)
+            # Disabled variant that unpacks the dict into keyword arguments:
+            # inputs = {'input_ids': input_ids,
+            #           'attention_mask': input_mask,
+            #           'token_type_ids': token_type_ids}
+            # sequence_output, pooled_output = model(**inputs)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            sequence_output, pooled_output = model(inputs)
+            inputs = [input_ids, input_mask]
+            sequence_output, pooled_output = model(inputs)
+            sequence_output, pooled_output = model(input_ids)
+            result = {
+                "sequence_output": sequence_output.numpy(),
+                "pooled_output": pooled_output.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].shape),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
+        def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            # Expects prediction scores of shape [batch_size, seq_length, vocab_size].
+            model = TFBertForMaskedLM(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            # The trailing comma unpacks a one-element result; any other length raises.
+            prediction_scores, = model(inputs)
+            result = {
+                "prediction_scores": prediction_scores.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].shape),
+                [self.batch_size, self.seq_length, self.vocab_size])
+        def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            # Expects a sequence-relationship score of shape [batch_size, 2].
+            model = TFBertForNextSentencePrediction(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            seq_relationship_score, = model(inputs)
+            result = {
+                "seq_relationship_score": seq_relationship_score.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["seq_relationship_score"].shape),
+                [self.batch_size, 2])
+        def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            # Expects two outputs: prediction scores [batch_size, seq_length, vocab_size]
+            # and a sequence-relationship score [batch_size, 2].
+            model = TFBertForPreTraining(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            prediction_scores, seq_relationship_score = model(inputs)
+            result = {
+                "prediction_scores": prediction_scores.numpy(),
+                "seq_relationship_score": seq_relationship_score.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].shape),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.parent.assertListEqual(
+                list(result["seq_relationship_score"].shape),
+                [self.batch_size, 2])
+        def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            # Sets config.num_labels on the passed config, then expects logits of shape
+            # [batch_size, num_labels].
+            config.num_labels = self.num_labels
+            model = TFBertForSequenceClassification(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            logits, = model(inputs)
+            result = {
+                "logits": logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].shape),
+                [self.batch_size, self.num_labels])
+        def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            # Sets config.num_choices, repeats every input num_choices times along a new
+            # axis 1, and expects logits of shape [batch_size, num_choices].
+            config.num_choices = self.num_choices
+            model = TFBertForMultipleChoice(config=config)
+            # expand_dims inserts axis 1; tile then copies the tensor num_choices times along it.
+            # input_mask and token_type_ids are passed to tf.expand_dims unguarded, so this
+            # check presumably relies on the default use_input_mask/use_token_type_ids=True -- TODO confirm.
+            multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+            multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+            multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+            inputs = {'input_ids': multiple_choice_inputs_ids,
+                      'attention_mask': multiple_choice_input_mask,
+                      'token_type_ids': multiple_choice_token_type_ids}
+            logits, = model(inputs)
+            result = {
+                "logits": logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].shape),
+                [self.batch_size, self.num_choices])
+        def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            # Sets config.num_labels, then expects per-token logits of shape
+            # [batch_size, seq_length, num_labels].
+            config.num_labels = self.num_labels
+            model = TFBertForTokenClassification(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            logits, = model(inputs)
+            result = {
+                "logits": logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].shape),
+                [self.batch_size, self.seq_length, self.num_labels])
+        def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            # Expects start and end logits, each of shape [batch_size, seq_length].
+            model = TFBertForQuestionAnswering(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            start_logits, end_logits = model(inputs)
+            result = {
+                "start_logits": start_logits.numpy(),
+                "end_logits": end_logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["start_logits"].shape),
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["end_logits"].shape),
+                [self.batch_size, self.seq_length])
+        def prepare_config_and_inputs_for_common(self):
+            # Returns (config, inputs_dict) where inputs_dict holds input_ids,
+            # token_type_ids and attention_mask; the three label tensors are dropped.
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            return config, inputs_dict
+    def setUp(self):
+        # Builds a fresh model tester and config tester before every test method.
+        self.model_tester = TFBertModelTest.TFBertModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
+    def test_config(self):
+        self.config_tester.run_common_tests()
+    # Each test below draws new inputs from the model tester and forwards the whole
+    # 7-tuple positionally to the matching create_and_check_* method.
+    def test_bert_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_model(*config_and_inputs)
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
+    def test_for_next_sequence_prediction(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
+    def test_for_pretraining(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs)
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs)
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        # Loads 'bert-base-uncased' with cache_dir as the cache location and checks that a
+        # model object comes back. The cache directory is deleted right after each load,
+        # before the assertion runs.
+        cache_dir = "/tmp/transformers_test/"
+        # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        for model_name in ['bert-base-uncased']:
+            model = TFBertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+if __name__ == "__main__":
+    unittest.main()
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function
+import copy
+import json
+import logging
+import importlib
+import random
+import shutil
+import unittest
+import uuid
+import pytest
+import sys
+from transformers import is_tf_available, is_torch_available
+if is_tf_available():
+    import tensorflow as tf
+    import numpy as np
+    from transformers import TFPreTrainedModel
+    # from transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+else:
+    pytestmark = pytest.mark.skip("Require TensorFlow")
+def _config_zero_init(config):
+    # Returns a deep copy of `config` in which every instance attribute whose name
+    # contains '_range' or '_std' is set to 0.0; the object passed in is not modified.
+    # Presumably those attributes control weight-initialisation spread -- TODO confirm.
+    configs_no_init = copy.deepcopy(config)
+    for key in configs_no_init.__dict__.keys():
+        if '_range' in key or '_std' in key:
+            setattr(configs_no_init, key, 0.0)
+    return configs_no_init
+class TFCommonTestCases:
+    class TFCommonModelTester(unittest.TestCase):
+        model_tester = None
+        all_model_classes = ()
+        test_torchscript = True
+        test_pruning = True
+        test_resize_embeddings = True
+        def test_initialization(self):
+            # Placeholder: the body is only `pass`, so this test always succeeds.
+            # The disabled code below is written against a PyTorch-style API
+            # (model.named_parameters(), param.requires_grad) and is never executed.
+            pass
+            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            # configs_no_init = _config_zero_init(config)
+            # for model_class in self.all_model_classes:
+            #     model = model_class(config=configs_no_init)
+            #     for name, param in model.named_parameters():
+            #         if param.requires_grad:
+            #             self.assertIn(param.data.mean().item(), [0.0, 1.0],
+            #             msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
+        def test_pt_tf_model_equivalence(self):
+            # For every TF model class, builds the TF model and the PyTorch class of the same
+            # name without the "TF" prefix, then passes them through both conversion helpers.
+            # No assertion is made here: the test only fails if one of the calls raises.
+            # When PyTorch is unavailable the method returns early, which counts as a pass.
+            if not is_torch_available():
+                return
+            import transformers
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            for model_class in self.all_model_classes:
+                pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
+                pt_model_class = getattr(transformers, pt_model_class_name)
+                tf_model = model_class(config)
+                pt_model = pt_model_class(config)
+                tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
+                pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
+        def test_keyword_and_dict_args(self):
+            # Checks that a model gives the same first output whether it is called with one
+            # dict of inputs or with input_ids positionally and the remaining inputs as keywords.
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                outputs_dict = model(inputs_dict)
+                # Work on a copy so that popping 'input_ids' leaves inputs_dict intact for the next class.
+                inputs_keywords = copy.deepcopy(inputs_dict)
+                input_ids = inputs_keywords.pop('input_ids')
+                outputs_keywords = model(input_ids, **inputs_keywords)
+                output_dict = outputs_dict[0].numpy()
+                output_keywords = outputs_keywords[0].numpy()
+                # The summed absolute difference over all elements must stay below 1e-6.
+                self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)
+        def test_attention_outputs(self):
+            # Checks that the last model output holds the attentions: one entry per hidden
+            # layer, whose last three dimensions are
+            # [num_attention_heads, seq_length, key_len (or seq_length if the tester has no key_len)].
+            # The shared config object is mutated in place for each pass.
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            for model_class in self.all_model_classes:
+                # First pass: attentions only.
+                config.output_attentions = True
+                config.output_hidden_states = False
+                model = model_class(config)
+                outputs = model(inputs_dict)
+                attentions = [t.numpy() for t in outputs[-1]]
+                self.assertEqual(model.config.output_attentions, True)
+                self.assertEqual(model.config.output_hidden_states, False)
+                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads,
+                    self.model_tester.seq_length,
+                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                # Remember the output count so the second pass can check for exactly one extra output.
+                out_len = len(outputs)
+                # Check attention is always last and order is fine
+                config.output_attentions = True
+                config.output_hidden_states = True
+                model = model_class(config)
+                outputs = model(inputs_dict)
+                self.assertEqual(out_len+1, len(outputs))
+                self.assertEqual(model.config.output_attentions, True)
+                self.assertEqual(model.config.output_hidden_states, True)
+                attentions = [t.numpy() for t in outputs[-1]]
+                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads,
+                    self.model_tester.seq_length,
+                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+        def test_headmasking(self):
+            # Placeholder: the body is only `pass`, so this test always succeeds.
+            # The disabled code below relies on PyTorch calls (torch.ones, model.eval(),
+            # output.backward(), head_mask.grad) and is never executed.
+            pass
+            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            # config.output_attentions = True
+            # config.output_hidden_states = True
+            # configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
+            # for model_class in self.all_model_classes:
+            #     model = model_class(config=configs_no_init)
+            #     model.eval()
+            #     # Prepare head_mask
+            #     # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
+            #     head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
+            #     head_mask[0, 0] = 0
+            #     head_mask[-1, :-1] = 0
+            #     head_mask.requires_grad_(requires_grad=True)
+            #     inputs = inputs_dict.copy()
+            #     inputs['head_mask'] = head_mask
+            #     outputs = model(**inputs)
+            #     # Test that we can get a gradient back for importance score computation
+            #     output = sum(t.sum() for t in outputs[0])
+            #     output = output.sum()
+            #     output.backward()
+            #     multihead_outputs = head_mask.grad
+            #     attentions = outputs[-1]
+            #     hidden_states = outputs[-2]
+            #     # Remove Nan
+            #     self.assertIsNotNone(multihead_outputs)
+            #     self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
+            #     self.assertAlmostEqual(
+            #         attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
+            #     self.assertNotEqual(
+            #         attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
+            #     self.assertNotEqual(
+            #         attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
+            #     self.assertAlmostEqual(
+            #         attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
+            #     self.assertNotEqual(
+            #         attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
+        def test_head_pruning(self):
+            # Placeholder: the body is only `pass`, so this test always succeeds and the
+            # test_pruning class flag is not consulted. The disabled code below calls
+            # model.eval() and model.prune_heads(), and is never executed.
+            pass
+            # if not self.test_pruning:
+            #     return
+            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            # for model_class in self.all_model_classes:
+            #     config.output_attentions = True
+            #     config.output_hidden_states = False
+            #     model = model_class(config=config)
+            #     model.eval()
+            #     heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
+            #                     -1: [0]}
+            #     model.prune_heads(heads_to_prune)
+            #     outputs = model(**inputs_dict)
+            #     attentions = outputs[-1]
+            #     self.assertEqual(
+            #         attentions[0].shape[-3], 1)
+            #     self.assertEqual(
+            #         attentions[1].shape[-3], self.model_tester.num_attention_heads)
+            #     self.assertEqual(
+            #         attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+        def test_hidden_states_output(self):
+            # With output_hidden_states on and output_attentions off, the last model output
+            # must hold num_hidden_layers + 1 tensors whose last two dimensions are
+            # [seq_length, hidden_size]. The shared config object is mutated in place.
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            for model_class in self.all_model_classes:
+                config.output_hidden_states = True
+                config.output_attentions = False
+                model = model_class(config)
+                outputs = model(inputs_dict)
+                hidden_states = [t.numpy() for t in outputs[-1]]
+                self.assertEqual(model.config.output_attentions, False)
+                self.assertEqual(model.config.output_hidden_states, True)
+                self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
+                self.assertListEqual(
+                    list(hidden_states[0].shape[-2:]),
+                    [self.model_tester.seq_length, self.model_tester.hidden_size])
+        def test_resize_tokens_embeddings(self):
+            # Placeholder: the body is only `pass`, so this test always succeeds and the
+            # test_resize_embeddings class flag is not consulted. The disabled code below
+            # calls model.resize_token_embeddings() and weight.clone(), and is never executed.
+            pass
+            # original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            # if not self.test_resize_embeddings:
+            #     return
+            # for model_class in self.all_model_classes:
+            #     config = copy.deepcopy(original_config)
+            #     model = model_class(config)
+            #     model_vocab_size = config.vocab_size
+            #     # Retrieve the embeddings and clone them
+            #     model_embed = model.resize_token_embeddings(model_vocab_size)
+            #     cloned_embeddings = model_embed.weight.clone()
+            #     # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+            #     model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+            #     self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
+            #     # Check that it actually resizes the embeddings matrix
+            #     self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+            #     # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+            #     model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+            #     self.assertEqual(model.config.vocab_size, model_vocab_size - 15)
+            #     # Check that it actually resizes the embeddings matrix
+            #     self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+            #     # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+            #     models_equal = True
+            #     for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+            #         if p1.data.ne(p2.data).sum() > 0:
+            #             models_equal = False
+            #     self.assertTrue(models_equal)
+        def test_tie_model_weights(self):
+            # Placeholder: the only executable statement is `pass`, so this test
+            # currently succeeds without building or checking any model.
+            # The checks below are disabled (kept as comments).
+            # NOTE(review): the disabled code toggles `config.torchscript` and calls
+            # `.parameters()` / `.data.ne(...)`, which looks carried over from a
+            # PyTorch test -- confirm and port before re-enabling.
+            pass
+            # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            # def check_same_values(layer_1, layer_2):
+            #     equal = True
+            #     for p1, p2 in zip(layer_1.weight, layer_2.weight):
+            #         if p1.data.ne(p2.data).sum() > 0:
+            #             equal = False
+            #     return equal
+            # for model_class in self.all_model_classes:
+            #     if not hasattr(model_class, 'tie_weights'):
+            #         continue
+            #     config.torchscript = True
+            #     model_not_tied = model_class(config)
+            #     params_not_tied = list(model_not_tied.parameters())
+            #     config_tied = copy.deepcopy(config)
+            #     config_tied.torchscript = False
+            #     model_tied = model_class(config_tied)
+            #     params_tied = list(model_tied.parameters())
+            #     # Check that the embedding layer and decoding layer are the same in size and in value
+            #     self.assertGreater(len(params_not_tied), len(params_tied))
+            #     # Check that after resize they remain tied.
+            #     model_tied.resize_token_embeddings(config.vocab_size + 10)
+            #     params_tied_2 = list(model_tied.parameters())
+            #     self.assertGreater(len(params_not_tied), len(params_tied))
+            #     self.assertEqual(len(params_tied_2), len(params_tied))
+        def test_determinism(self):
+            # For every model class, call the same model instance twice on the same
+            # inputs with training=False and require the first output of both calls
+            # to be element-wise equal.
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                run_a = model(inputs_dict, training=False)[0]
+                run_b = model(inputs_dict, training=False)[0]
+                outputs_match = tf.math.equal(run_a, run_b).numpy().all()
+                self.assertTrue(outputs_match)
+def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None):
+    """Build a tensor of random ids drawn from ``[0, vocab_size - 1]``.
+
+    Args:
+        shape: iterable of dimension sizes; also passed to ``tf.constant`` as the shape.
+        vocab_size: each value is drawn with ``rng.randint(0, vocab_size - 1)``.
+        rng: generator to draw from; a fresh ``random.Random()`` is used when ``None``.
+        name: accepted but not used by this function.
+        dtype: dtype passed to ``tf.constant``; ``tf.int32`` when ``None``.
+
+    Returns:
+        The ``tf.constant`` built from the drawn values.
+    """
+    generator = random.Random() if rng is None else rng
+
+    # Number of values to draw is the product of all dimensions.
+    num_values = 1
+    for size in shape:
+        num_values *= size
+
+    drawn = [generator.randint(0, vocab_size - 1) for _ in range(num_values)]
+    tensor_dtype = tf.int32 if dtype is None else dtype
+    return tf.constant(drawn, shape=shape, dtype=tensor_dtype)
+# Test case for model-loading utilities. Its single test is currently a
+# placeholder: the body is `pass` and the real checks are commented out.
+class TFModelUtilsTest(unittest.TestCase):
+    # Skipped when the `tensorflow` module is not present in `sys.modules`,
+    # i.e. when it has not been imported by the time this decorator is evaluated.
+    @pytest.mark.skipif('tensorflow' not in sys.modules, reason="requires TensorFlow")
+    def test_model_from_pretrained(self):
+        # No-op for now; the disabled code below loads a BERT config and model
+        # via `from_pretrained` and checks the loading info and config flags.
+        pass
+        # logging.basicConfig(level=logging.INFO)
+        # for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+        #     config = BertConfig.from_pretrained(model_name)
+        #     self.assertIsNotNone(config)
+        #     self.assertIsInstance(config, PretrainedConfig)
+        #     model = BertModel.from_pretrained(model_name)
+        #     model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
+        #     self.assertIsNotNone(model)
+        #     self.assertIsInstance(model, PreTrainedModel)
+        #     for value in loading_info.values():
+        #         self.assertEqual(len(value), 0)
+        #     config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+        #     model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
+        #     self.assertEqual(model.config.output_attentions, True)
+        #     self.assertEqual(model.config.output_hidden_states, True)
+        #     self.assertEqual(model.config, config)
+# Run this module's tests when the file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()