Unverified commit 292140b9, authored by Thomas Wolf and committed by GitHub

Merge pull request #781 from huggingface/embeddings

Clean up input embeddings resizing and weights tying
parents 3821ecbf c57e9d94
@@ -507,23 +507,17 @@ class BertPredictionHeadTransform(nn.Module):
 class BertLMPredictionHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertLMPredictionHead, self).__init__()
         self.transform = BertPredictionHeadTransform(config)
-        self.torchscript = config.torchscript

         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
-        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
-                                 bert_model_embedding_weights.size(0),
+        self.decoder = nn.Linear(config.hidden_size,
+                                 config.vocab_size,
                                  bias=False)
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
-        else:
-            self.decoder.weight = bert_model_embedding_weights
-        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)

@@ -532,9 +526,9 @@ class BertLMPredictionHead(nn.Module):
 class BertOnlyMLMHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertOnlyMLMHead, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)

     def forward(self, sequence_output):
         prediction_scores = self.predictions(sequence_output)

@@ -552,9 +546,9 @@ class BertOnlyNSPHead(nn.Module):
 class BertPreTrainingHeads(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertPreTrainingHeads, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)
         self.seq_relationship = nn.Linear(config.hidden_size, 2)

     def forward(self, sequence_output, pooled_output):

@@ -619,6 +613,12 @@ class BertModel(BertPreTrainedModel):
         self.apply(self.init_weights)

+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+        return self.embeddings.word_embeddings
+
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}

@@ -750,9 +750,17 @@ class BertForPreTraining(BertPreTrainedModel):
         super(BertForPreTraining, self).__init__(config)

         self.bert = BertModel(config)
-        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertPreTrainingHeads(config)

         self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
+                                   self.bert.embeddings.word_embeddings)

     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):

@@ -845,9 +853,17 @@ class BertForMaskedLM(BertPreTrainedModel):
         super(BertForMaskedLM, self).__init__(config)

         self.bert = BertModel(config)
-        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertOnlyMLMHead(config)

         self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
+                                   self.bert.embeddings.word_embeddings)

     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         """
......
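For reference, a minimal sketch of what the new BERT wiring gives you (the tiny config values are arbitrary, and the asserts assume the default non-TorchScript config, so tie_weights shares the tensor instead of cloning it):

from pytorch_transformers import BertConfig, BertForMaskedLM

# Small randomly initialized model; no pretrained download needed.
config = BertConfig(vocab_size_or_config_json_file=100, hidden_size=32,
                    num_hidden_layers=2, num_attention_heads=2, intermediate_size=64)
model = BertForMaskedLM(config)

# With torchscript=False, the MLM decoder and the input word embeddings
# now point to the same Parameter object.
assert model.cls.predictions.decoder.weight is model.bert.embeddings.word_embeddings.weight

# resize_token_embeddings (added in modeling_utils below) grows the matrix,
# updates config.vocab_size and re-ties the decoder to the resized embeddings.
model.resize_token_embeddings(110)
assert model.config.vocab_size == 110
assert model.cls.predictions.decoder.weight.shape[0] == 110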
@@ -104,7 +104,6 @@ class GPT2Config(PretrainedConfig):
     Args:
         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -119,14 +118,12 @@ class GPT2Config(PretrainedConfig):
         embd_pdrop: The dropout ratio for the embeddings.
         initializer_range: The sttdev of the truncated_normal_initializer for
             initializing all weight matrices.
-        predict_special_tokens: should we predict special tokens (when the model has a LM head)
     """
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(
         self,
         vocab_size_or_config_json_file=50257,
-        n_special=0,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -137,7 +134,6 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        predict_special_tokens=True,
         num_labels=1,
         summary_type='token_ids',
@@ -151,7 +147,6 @@ class GPT2Config(PretrainedConfig):
         Args:
             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -166,7 +161,6 @@ class GPT2Config(PretrainedConfig):
             embd_pdrop: The dropout ratio for the embeddings.
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
-            predict_special_tokens: should we predict special tokens (when the model has a LM head)
         """
         super(GPT2Config, self).__init__(**kwargs)
@@ -178,7 +172,6 @@ class GPT2Config(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -189,7 +182,6 @@ class GPT2Config(PretrainedConfig):
             self.attn_pdrop = attn_pdrop
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
             self.num_labels = num_labels
             self.summary_type = summary_type
@@ -203,10 +195,6 @@ class GPT2Config(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )

-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
     @property
     def hidden_size(self):
         return self.n_embd

@@ -347,34 +335,6 @@ class Block(nn.Module):
         return outputs  # x, present, (attentions)

-class GPT2LMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(GPT2LMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        self.torchscript = config.torchscript
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-        # Export to TorchScript can't handle parameter sharing so we are cloning them.
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
-        else:
-            self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
 class GPT2PreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.

@@ -400,36 +360,6 @@ class GPT2PreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)

-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `gpt2`
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . a TensorFlow checkpoint with trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific GPT2 class
-        """
-        num_special_tokens = kwargs.pop('num_special_tokens', None)
-        model = super(GPT2PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens)
-        return model
-
 class GPT2Model(GPT2PreTrainedModel):
     """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").

@@ -447,13 +377,13 @@ class GPT2Model(GPT2PreTrainedModel):
             config.vocab_size - 1,     ______________________
             config.vocab_size,
             ...                        -> special embeddings
-            config.vocab_size + config.n_special - 1]  ______________________
+            config.vocab_size + n_special - 1]  ______________________

-        where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
+        where total_tokens_embeddings is equal to

         ::

-            total_tokens_embeddings = config.vocab_size + config.n_special
+            total_tokens_embeddings = vocab_size + n_special

         You should use the associated indices to index the embeddings.

@@ -474,7 +404,7 @@ class GPT2Model(GPT2PreTrainedModel):
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions

-        self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])

@@ -482,26 +412,9 @@ class GPT2Model(GPT2PreTrainedModel):
         self.apply(self.init_weights)

-    def set_num_special_tokens(self, num_special_tokens=None):
-        """
-        Update input embeddings with new embedding matrix if needed.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-        TODO Lysandre filled args
-        """
-        if num_special_tokens is None or self.config.n_special == num_special_tokens:
-            return
-
-        # Update config
-        self.config.n_special = num_special_tokens
-
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.wte
-        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.wte.to(old_embed.weight.device)
-        self.init_weights(self.wte)
-
-        # Copy word embeddings from the previous weights
-        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
+        return self.wte

     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.

@@ -641,23 +554,17 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     def __init__(self, config):
         super(GPT2LMHeadModel, self).__init__(config)
         self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-        TODO Lysandre filled args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.wte)

     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
         """

@@ -740,25 +647,17 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
         self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)

         self.apply(self.init_weights)

-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """
-        Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-        TODO Lysandre filled args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.wte)

     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
......
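With GPT2LMHead and set_num_special_tokens gone, growing the vocabulary now goes through the generic resize_token_embeddings added in modeling_utils further down. A rough sketch of the replacement workflow (the small config values and the number of added tokens are arbitrary):

from pytorch_transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(vocab_size_or_config_json_file=1000, n_positions=64, n_ctx=64,
                    n_embd=32, n_layer=2, n_head=2)
model = GPT2LMHeadModel(config)

# Previously: model.set_num_special_tokens(num_special_tokens)
num_added_tokens = 3
model.resize_token_embeddings(config.vocab_size + num_added_tokens)

assert model.transformer.wte.weight.shape[0] == 1003
assert model.lm_head.weight.shape[0] == 1003  # lm_head was re-tied to the resized wte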
@@ -156,7 +156,6 @@ class OpenAIGPTConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size_or_config_json_file=40478,
-        n_special=0,
         n_positions=512,
         n_ctx=512,
         n_embd=768,
@@ -190,7 +189,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -216,10 +214,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )

-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
     @property
     def hidden_size(self):
         return self.n_embd

@@ -355,34 +349,6 @@ class Block(nn.Module):
         return outputs

-class OpenAIGPTLMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(OpenAIGPTLMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        self.torchscript = config.torchscript
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
-        else:
-            self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
 class OpenAIGPTPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.

@@ -408,36 +374,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)

-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . a series of NumPy files containing OpenAI TensorFlow trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific OpenAI-GPT class
-        """
-        num_special_tokens = kwargs.get('num_special_tokens', None)
-        kwargs.pop('num_special_tokens', None)
-
-        model = super(OpenAIGPTPreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens)
-        return model
-
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").

@@ -457,13 +393,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             config.vocab_size - 1,     ______________________
             config.vocab_size,
             ...                        -> special embeddings
-            config.vocab_size + config.n_special - 1]  ______________________
+            config.vocab_size + n_special - 1]  ______________________

-        where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+        where ``total_tokens_embeddings`` is:

         ::

-            total_tokens_embeddings = config.vocab_size + config.n_special
+            total_tokens_embeddings = config.vocab_size + n_special

         You should use the associated indices to index the embeddings.

@@ -485,34 +421,16 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states

-        self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
         self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])

         self.apply(self.init_weights)

-    def set_num_special_tokens(self, num_special_tokens=None):
-        """
-        Update input embeddings with new embedding matrice if needed
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-        TODO Lysandre filled Args
-        """
-        if num_special_tokens is None or self.config.n_special == num_special_tokens:
-            return
-
-        # Update config
-        self.config.n_special = num_special_tokens
-
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.tokens_embed
-        self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.tokens_embed.to(old_embed.weight.device)
-        self.init_weights(self.tokens_embed)
-
-        # Copy word embeddings from the previous weights
-        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
+        return self.tokens_embed

     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.

@@ -657,24 +575,17 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-        TODO Lysandre filled Args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)

     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
         """

@@ -747,13 +658,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             config.vocab_size - 1,     ______________________
             config.vocab_size,
             ...                        -> special embeddings
-            config.vocab_size + config.n_special - 1]  ______________________
+            config.vocab_size + n_special - 1]  ______________________

-        where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+        where ``total_tokens_embeddings`` is:

         ::

-            total_tokens_embeddings = config.vocab_size + config.n_special
+            total_tokens_embeddings = config.vocab_size + n_special

         You should use the associate indices to index the embeddings.

@@ -773,24 +684,18 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)

         self.apply(self.init_weights)
+        self.tie_weights()

-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-        TODO Lysandre filled Args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)

     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
......
@@ -287,6 +287,14 @@ class TransfoXLConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")

+    @property
+    def vocab_size(self):
+        return self.n_token
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
     @property
     def hidden_size(self):
         return self.d_model

@@ -998,6 +1006,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.apply(self.init_weights)

+    def _resize_token_embeddings(self, new_num_tokens):
+        return self.word_emb
+
     def backward_compatible(self):
         self.sample_softmax = -1

@@ -1273,13 +1284,20 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         else:
             if self.config.tie_weight:
                 for i in range(len(self.crit.out_layers)):
-                    self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight
+                    self._tie_or_clone_weights(self.crit.out_layers[i],
+                                               self.transformer.word_emb.emb_layers[i])
             if self.config.tie_projs:
                 for i, tie_proj in enumerate(self.config.tie_projs):
                     if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
-                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+                        if self.config.torchscript:
+                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
+                        else:
+                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
                     elif tie_proj and self.config.div_val != 1:
-                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+                        if self.config.torchscript:
+                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
+                        else:
+                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]

     def reset_length(self, tgt_len, ext_len, mem_len):
         self.transformer.reset_length(tgt_len, ext_len, mem_len)
......
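The clone-or-share branches above for the adaptive-softmax projections mirror what the new PreTrainedModel._tie_or_clone_weights helper (in the modeling_utils hunk below) does for regular modules. A plain-PyTorch illustration of the two behaviours, with made-up module names:

import torch
import torch.nn as nn

embedding = nn.Embedding(10, 4)
tied_head = nn.Linear(4, 10, bias=False)
cloned_head = nn.Linear(4, 10, bias=False)

# torchscript=False path: both modules reference the same Parameter,
# so an update through one is visible through the other.
tied_head.weight = embedding.weight
assert tied_head.weight is embedding.weight

# torchscript=True path: an independent copy with equal values, which keeps
# torch.jit.trace from tripping over shared parameters during export.
cloned_head.weight = nn.Parameter(embedding.weight.clone())
assert cloned_head.weight is not embedding.weight
assert torch.equal(cloned_head.weight, embedding.weight)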
@@ -151,6 +151,7 @@ class PreTrainedModel(nn.Module):
     pretrained_model_archive_map = {}
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""
+    input_embeddings = None

     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()

@@ -164,12 +165,79 @@ class PreTrainedModel(nn.Module):
         # Save config in model
         self.config = config

+    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
+        """ Build a resized Embedding Module from a provided token Embedding Module.
+            Increasing the size will add newly initialized vectors at the end
+            Reducing the size will remove vectors from the end
+
+        Args:
+            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+                If not provided or None: return the provided token Embedding Module.
+        Return:
+            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
+        """
+        if new_num_tokens is None:
+            return old_embeddings
+
+        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+        if old_num_tokens == new_num_tokens:
+            return old_embeddings
+
+        # Build new embeddings
+        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
+        new_embeddings.to(old_embeddings.weight.device)
+
+        # initialize all new embeddings (in particular added tokens)
+        self.init_weights(new_embeddings)
+
+        # Copy word embeddings from the previous weights
+        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
+        new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
+
+        return new_embeddings
+
+    def _tie_or_clone_weights(self, first_module, second_module):
+        """ Tie or clone module weights depending of weither we are using TorchScript or not
+        """
+        if self.config.torchscript:
+            first_module.weight = nn.Parameter(second_module.weight.clone())
+        else:
+            first_module.weight = second_module.weight
+
+    def resize_token_embeddings(self, new_num_tokens=None):
+        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
+
+        Args:
+            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+                If not provided or None: does nothing.
+        Return:
+            Pointer to the input tokens Embedding Module of the model
+        """
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
+        if new_num_tokens is None:
+            return model_embeds
+
+        # Update base model and current model config
+        self.config.vocab_size = new_num_tokens
+        base_model.vocab_size = new_num_tokens
+
+        # Tie weights again if needed
+        if hasattr(self, 'tie_weights'):
+            self.tie_weights()
+
+        return model_embeds
+
     def prune_heads(self, heads_to_prune):
         """ Prunes heads of the base model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
-        model_to_prune = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        model_to_prune._prune_heads(heads_to_prune)
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        base_model._prune_heads(heads_to_prune)

     def save_pretrained(self, save_directory):
         """ Save a model with its configuration file to a directory, so that it
......
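A standalone sketch of the copy semantics _get_resized_embeddings implements, in plain PyTorch (note the real method also runs the model-specific init_weights on the fresh module before copying, which this sketch skips):

import torch
import torch.nn as nn

old_embeddings = nn.Embedding(5, 3)
new_num_tokens = 8

# Fresh, larger module on the same device, then copy the overlapping rows.
new_embeddings = nn.Embedding(new_num_tokens, 3)
new_embeddings.to(old_embeddings.weight.device)
num_tokens_to_copy = min(old_embeddings.num_embeddings, new_num_tokens)
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]

assert torch.equal(new_embeddings.weight[:5], old_embeddings.weight)  # old vectors preserved
assert new_embeddings.weight.shape == (8, 3)                          # rows 5..7 are newly initialized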
@@ -104,7 +104,6 @@ class XLMConfig(PretrainedConfig):
     def __init__(self,
                  vocab_size_or_config_json_file=30145,
-                 n_special=0,
                  emb_dim=2048,
                  n_layers=12,
                  n_heads=16,
@@ -148,7 +147,6 @@ class XLMConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.n_words = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.emb_dim = emb_dim
             self.n_layers = n_layers
             self.n_heads = n_heads
@@ -183,8 +181,12 @@ class XLMConfig(PretrainedConfig):
                              "or the path to a pretrained model config file (str)")

     @property
-    def total_tokens_embeddings(self):
-        return self.n_words + self.n_special
+    def vocab_size(self):
+        return self.n_words
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_words = value

     @property
     def hidden_size(self):

@@ -479,6 +481,10 @@ class XLMModel(XLMPreTrainedModel):
         self.apply(self.init_weights)

+    def _resize_token_embeddings(self, new_num_tokens):
+        self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
+        return self.embeddings
+
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}

@@ -718,8 +724,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     """
     def __init__(self, config):
         super(XLMWithLMHeadModel, self).__init__(config)
-        self.torchscript = config.torchscript
-
         self.transformer = XLMModel(config)
         self.pred_layer = XLMPredLayer(config)

@@ -729,10 +733,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.torchscript:
-            self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
-        else:
-            self.pred_layer.proj.weight = self.transformer.embeddings.weight
+        self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)

     def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, labels=None, head_mask=None):
......
@@ -312,6 +312,14 @@ class XLNetConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")

+    @property
+    def vocab_size(self):
+        return self.n_token
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
     @property
     def hidden_size(self):
         return self.d_model

@@ -654,9 +662,12 @@ class XLNetModel(XLNetPreTrainedModel):
         self.apply(self.init_weights)

+    def _resize_token_embeddings(self, new_num_tokens):
+        self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
+        return self.word_embedding
+
     def _prune_heads(self, heads_to_prune):
-        logger.info("Head pruning is not implemented for XLNet")
-        pass
+        raise NotImplementedError

     def create_mask(self, qlen, mlen):
         """

@@ -970,23 +981,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         super(XLNetLMHeadModel, self).__init__(config)
         self.attn_type = config.attn_type
         self.same_length = config.same_length
-        self.torchscript = config.torchscript

         self.transformer = XLNetModel(config)
         self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)

-        # Tie weights
         self.apply(self.init_weights)
         self.tie_weights()

     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.torchscript:
-            self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
-        else:
-            self.lm_loss.weight = self.transformer.word_embedding.weight
+        self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)

     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
......
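For the configs that keep their historical attribute names (n_token for XLNet and Transfo-XL, n_words for XLM), the new vocab_size property above simply aliases that attribute, which is what lets resize_token_embeddings update every config the same way. A quick sketch with XLNetConfig (the vocabulary sizes are arbitrary):

from pytorch_transformers import XLNetConfig

config = XLNetConfig(vocab_size_or_config_json_file=32000)
assert config.vocab_size == config.n_token == 32000

# resize_token_embeddings writes through the property setter.
config.vocab_size = 32010
assert config.n_token == 32010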
@@ -26,10 +26,15 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                   BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP

-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)

-class BertModelTest(unittest.TestCase):
+class BertModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
+                         BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
+                         BertForTokenClassification)

     class BertModelTester(object):

         def __init__(self,

@@ -55,9 +60,6 @@ class BertModelTest(unittest.TestCase):
                      num_labels=3,
                      num_choices=4,
                      scope=None,
-                     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
-                                          BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-                                          BertForTokenClassification),
                      ):
             self.parent = parent
             self.batch_size = batch_size

@@ -81,7 +83,6 @@ class BertModelTest(unittest.TestCase):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
-            self.all_model_classes = all_model_classes

         def prepare_config_and_inputs(self):
             input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

@@ -253,52 +254,59 @@ class BertModelTest(unittest.TestCase):
             self.check_loss_output(result)

-        def create_and_check_bert_commons(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
             inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
-            create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict

-    def test_default(self):
-        self.run_tester(BertModelTest.BertModelTester(self))
+    def setUp(self):
+        self.model_tester = BertModelTest.BertModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)

     def test_config(self):
-        config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
-        config_tester.run_common_tests()
+        self.config_tester.run_common_tests()

-    @pytest.mark.slow
-    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(model)
+    def test_bert_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_model(*config_and_inputs)

-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_model(*config_and_inputs)
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
+    def test_for_next_sequence_prediction(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
+    def test_for_pretraining(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_pretraining(*config_and_inputs)
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_question_answering(*config_and_inputs)
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_token_classification(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_commons(*config_and_inputs)
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_transformers_test/"
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)

 if __name__ == "__main__":
     unittest.main()
@@ -16,19 +16,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import os
 import unittest
-import json
-import random
-import shutil

 import pytest
-import torch

 from pytorch_transformers import (GPT2Config, GPT2Model,
                                   GPT2LMHeadModel, GPT2DoubleHeadsModel)

-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester

 class GPT2ModelTest(unittest.TestCase):

@@ -37,14 +32,14 @@ class GPT2ModelTest(unittest.TestCase):
         config_tester.run_common_tests()

     def test_model(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
-                                      lm_head_model_class=GPT2LMHeadModel,
-                                      double_head_model_class=GPT2DoubleHeadsModel)
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+                                                      lm_head_model_class=GPT2LMHeadModel,
+                                                      double_head_model_class=GPT2DoubleHeadsModel)
         model_tester.run_common_tests(test_presents=True)

     @pytest.mark.slow
     def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
-                                      lm_head_model_class=GPT2LMHeadModel,
-                                      double_head_model_class=GPT2DoubleHeadsModel)
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+                                                      lm_head_model_class=GPT2LMHeadModel,
+                                                      double_head_model_class=GPT2DoubleHeadsModel)
         model_tester.run_slow_tests()
......
@@ -19,12 +19,11 @@ from __future__ import print_function
 import unittest

 import pytest
-import torch

 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                   OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)

-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester

 class OpenAIModelTest(unittest.TestCase):

@@ -33,14 +32,14 @@ class OpenAIModelTest(unittest.TestCase):
         config_tester.run_common_tests()

     def test_model(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
-                                      lm_head_model_class=OpenAIGPTLMHeadModel,
-                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+                                                      lm_head_model_class=OpenAIGPTLMHeadModel,
+                                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
         model_tester.run_common_tests(test_presents=False)

     @pytest.mark.slow
     def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
-                                      lm_head_model_class=OpenAIGPTLMHeadModel,
-                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+                                                      lm_head_model_class=OpenAIGPTLMHeadModel,
+                                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
         model_tester.run_slow_tests()
......
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import os
import shutil
import json
import random
import torch
def _config_zero_init(config):
configs_no_init = copy.deepcopy(config)
for key in configs_no_init.__dict__.keys():
if '_range' in key or '_std' in key:
setattr(configs_no_init, key, 0.0)
return configs_no_init
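# Illustrative sketch (not part of the original suite): _config_zero_init deep-copies
# the config and zeroes every "*_range" / "*_std" attribute so that freshly built
# models have deterministic, NaN-free weights. The dummy config below is a made-up
# stand-in used only to show the effect; it is never called by the tests.
def _example_config_zero_init():
    class _DummyConfig(object):
        def __init__(self):
            self.initializer_range = 0.02
            self.hidden_size = 32
    zeroed = _config_zero_init(_DummyConfig())
    assert zeroed.initializer_range == 0.0   # init scales are zeroed out
    assert zeroed.hidden_size == 32          # other attributes are left untouched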
def _create_and_check_torchscript_output_attentions(tester, model_classes, config, inputs_dict):
config.output_attentions = True
_create_and_check_torchscript(tester, model_classes, config, inputs_dict)
def _create_and_check_torchscript_output_hidden_state(tester, model_classes, config, inputs_dict):
config.output_hidden_states = True
_create_and_check_torchscript(tester, model_classes, config, inputs_dict)
def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
    configs_no_init = _config_zero_init(config)  # To be sure we have no NaN
configs_no_init.torchscript = True
for model_class in model_classes:
model = model_class(config=configs_no_init)
model.eval()
inputs = inputs_dict['input_ids'] # Let's keep only input_ids
try:
torch.jit.trace(model, inputs)
except RuntimeError:
tester.parent.fail("Couldn't trace module.")
        try:
            traced_model = torch.jit.trace(model, inputs)
            torch.jit.save(traced_model, "traced_model.pt")
        except RuntimeError:
            tester.parent.fail("Couldn't save module.")
try:
loaded_model = torch.jit.load("traced_model.pt")
os.remove("traced_model.pt")
except ValueError:
tester.parent.fail("Couldn't load module.")
model.eval()
loaded_model.eval()
model_params = model.parameters()
loaded_model_params = loaded_model.parameters()
models_equal = True
for p1, p2 in zip(model_params, loaded_model_params):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
tester.parent.assertTrue(models_equal)
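# A minimal sketch (assumption: any torch.jit.ScriptModule) of the same save/load
# round trip as above, but written against a temporary directory so the traced
# artifact never lands in the working directory. It is illustrative only and is not
# wired into the checks.
def _example_torchscript_roundtrip(traced_module):
    import tempfile
    tmp_dir = tempfile.mkdtemp()
    try:
        path = os.path.join(tmp_dir, "traced_model.pt")
        torch.jit.save(traced_module, path)     # serialize the traced graph and weights
        return torch.jit.load(path)             # reload it as an independent module
    finally:
        shutil.rmtree(tmp_dir)                  # always clean up the temporary files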
def _create_and_check_initialization(tester, model_classes, config, inputs_dict):
configs_no_init = _config_zero_init(config)
for model_class in model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0],
msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
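# Why the mean must be exactly 0.0 or 1.0 here: with _config_zero_init all "*_range"
# and "*_std" values are zero, so randomly initialized weights collapse to zeros,
# while parameters initialized to ones (e.g. LayerNorm weights) keep a mean of 1.0.
# Any other mean points at a parameter that bypasses the model's weight initialization.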
def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
    configs_no_init = _config_zero_init(config)  # To be sure we have no NaN
for model_class in model_classes:
config.output_attentions = True
config.output_hidden_states = True
model = model_class(config=configs_no_init)
model.eval()
# Prepare head_mask
        # Set requires_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
head_mask = torch.ones(tester.num_hidden_layers, tester.num_attention_heads)
head_mask[0, 0] = 0
head_mask[-1, :-1] = 0
head_mask.requires_grad_(requires_grad=True)
inputs = inputs_dict.copy()
inputs['head_mask'] = head_mask
outputs = model(**inputs)
# Test that we can get a gradient back for importance score computation
output = sum(t.sum() for t in outputs[0])
output = output.sum()
output.backward()
multihead_outputs = head_mask.grad
attentions = outputs[-1]
hidden_states = outputs[-2]
        # Check that a gradient flows back to the head mask and that masked heads produce (near) zero attention
tester.parent.assertIsNotNone(multihead_outputs)
tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers)
tester.parent.assertAlmostEqual(
attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
tester.parent.assertNotEqual(
attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
tester.parent.assertNotEqual(
attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
tester.parent.assertAlmostEqual(
attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
tester.parent.assertNotEqual(
attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
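# Reading of the assertions above (explanatory note, not original code): the mask
# zeroes head 0 of the first layer and every head except the last one of the final
# layer, so those attention slices must sum to ~0, while unmasked heads (the last
# head of the first layer, head 0 of the second layer, the last head of the final
# layer) must keep non-zero attention weights.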
def _create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict):
for model_class in model_classes:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config=config)
model.eval()
heads_to_prune = {0: list(range(1, tester.num_attention_heads)),
-1: [0]}
model.prune_heads(heads_to_prune)
outputs = model(**inputs_dict)
attentions = outputs[-1]
tester.parent.assertEqual(
attentions[0].shape[-3], 1)
tester.parent.assertEqual(
attentions[1].shape[-3], tester.num_attention_heads)
tester.parent.assertEqual(
attentions[-1].shape[-3], tester.num_attention_heads - 1)
def _create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
for model_class in model_classes:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config)
model.eval()
outputs = model(**inputs_dict)
attentions = outputs[-1]
tester.parent.assertEqual(model.config.output_attentions, True)
tester.parent.assertEqual(model.config.output_hidden_states, False)
tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
tester.parent.assertListEqual(
list(attentions[0].shape[-3:]),
[tester.num_attention_heads,
tester.seq_length,
tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
out_len = len(outputs)
# Check attention is always last and order is fine
config.output_attentions = True
config.output_hidden_states = True
model = model_class(config)
model.eval()
outputs = model(**inputs_dict)
tester.parent.assertEqual(out_len+1, len(outputs))
tester.parent.assertEqual(model.config.output_attentions, True)
tester.parent.assertEqual(model.config.output_hidden_states, True)
attentions = outputs[-1]
tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
tester.parent.assertListEqual(
list(attentions[0].shape[-3:]),
[tester.num_attention_heads,
tester.seq_length,
tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict):
for model_class in model_classes:
config.output_hidden_states = True
config.output_attentions = False
model = model_class(config)
model.eval()
outputs = model(**inputs_dict)
hidden_states = outputs[-1]
tester.parent.assertEqual(model.config.output_attentions, False)
tester.parent.assertEqual(model.config.output_hidden_states, True)
tester.parent.assertEqual(len(hidden_states), tester.num_hidden_layers + 1)
tester.parent.assertListEqual(
list(hidden_states[0].shape[-2:]),
[tester.seq_length, tester.hidden_size])
def create_and_check_commons(tester, config, inputs_dict, test_pruning=True, test_torchscript=True):
_create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
if test_torchscript:
_create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict)
if test_pruning:
_create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
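# Per-model testers funnel into create_and_check_commons with their own inputs_dict,
# switching off the passes an architecture cannot support (the Transfo-XL tester in
# this change disables both pruning and TorchScript, XLNet disables pruning only).
# Illustrative call:
#
#   create_and_check_commons(self, config, {'input_ids': input_ids},
#                            test_pruning=False, test_torchscript=False)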
def ids_tensor(shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
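# Small usage sketch for ids_tensor (not called by the tests): the helper returns a
# torch.long tensor of the requested shape whose values all fall inside the vocabulary.
def _example_ids_tensor_usage():
    sample = ids_tensor([2, 3], vocab_size=10)
    assert sample.shape == (2, 3)             # shape follows the requested dimensions
    assert sample.dtype == torch.long         # token ids are 64-bit integers
    assert int(sample.min()) >= 0 and int(sample.max()) < 10   # ids stay inside the vocab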
class ConfigTester(object):
def __init__(self, parent, config_class=None, **kwargs):
self.parent = parent
self.config_class = config_class
self.inputs_dict = kwargs
def create_and_test_config_common_properties(self):
config = self.config_class(**self.inputs_dict)
self.parent.assertTrue(hasattr(config, 'hidden_size'))
self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
def create_and_test_config_to_json_string(self):
config = self.config_class(**self.inputs_dict)
obj = json.loads(config.to_json_string())
for key, value in self.inputs_dict.items():
self.parent.assertEqual(obj[key], value)
def create_and_test_config_to_json_file(self):
config_first = self.config_class(**self.inputs_dict)
json_file_path = "/tmp/config.json"
config_first.to_json_file(json_file_path)
config_second = self.config_class.from_json_file(json_file_path)
os.remove(json_file_path)
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
def run_common_tests(self):
self.create_and_test_config_common_properties()
self.create_and_test_config_to_json_string()
self.create_and_test_config_to_json_file()
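# Typical wiring of ConfigTester inside a model test case, mirroring the XLM/XLNet
# tests touched by this change (placement shown for illustration only):
#
#   def setUp(self):
#       self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
#
#   def test_config(self):
#       self.config_tester.run_common_tests()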
class GPTModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_position_ids=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
n_special=1,
n_positions=33,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
n_choices=3,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
scope=None,
config_class=None,
base_model_class=None,
lm_head_model_class=None,
double_head_model_class=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_position_ids = use_position_ids
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_special = n_special
self.n_positions = n_positions
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.n_choices = n_choices
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.scope = scope
self.config_class = config_class
self.base_model_class = base_model_class
self.lm_head_model_class = lm_head_model_class
self.double_head_model_class = double_head_model_class
self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
def prepare_config_and_inputs(self):
total_num_tokens = self.vocab_size + self.n_special
input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
position_ids = None
if self.use_position_ids:
position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
token_type_ids = None
if self.use_token_type_ids:
total_voc = self.vocab_size
token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
mc_labels = None
lm_labels = None
mc_token_ids = None
if self.use_labels:
mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
config = self.config_class(
vocab_size_or_config_json_file=self.vocab_size,
n_special=self.n_special,
n_positions=self.n_positions,
n_embd=self.hidden_size,
n_layer=self.num_hidden_layers,
n_head=self.num_attention_heads,
initializer_range=self.initializer_range)
return (config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids)
def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.base_model_class(config)
model.eval()
outputs = model(input_ids, position_ids, token_type_ids)
outputs = model(input_ids, position_ids)
outputs = model(input_ids)
hidden_state = outputs[0]
self.parent.assertListEqual(
list(hidden_state.size()),
[self.batch_size, self.n_choices, self.seq_length, self.hidden_size])
def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.lm_head_model_class(config)
model.eval()
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
loss, lm_logits = outputs[:2]
total_voc = self.n_special + self.vocab_size
self.parent.assertListEqual(
list(lm_logits.size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertListEqual(
list(loss.size()),
[])
def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
for model_class in self.all_model_classes:
model = model_class(config)
model.eval()
outputs = model(input_ids)
presents = outputs[-1]
self.parent.assertEqual(self.num_hidden_layers, len(presents))
self.parent.assertListEqual(
list(presents[0].size()),
[2, self.batch_size * self.n_choices, self.num_attention_heads,
self.seq_length, self.hidden_size // self.num_attention_heads])
def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.double_head_model_class(config)
model.eval()
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
token_type_ids=token_type_ids, position_ids=position_ids)
lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
loss = [lm_loss, mc_loss]
total_voc = self.n_special + self.vocab_size
self.parent.assertListEqual(
list(lm_logits.size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertListEqual(
list(mc_logits.size()),
[self.batch_size, self.n_choices])
self.parent.assertListEqual(
[list(l.size()) for l in loss],
[[], []])
def create_and_check_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.parent.assertIsNotNone(model)
def create_and_check_commons(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
inputs_dict = {'input_ids': input_ids}
create_and_check_commons(self, config, inputs_dict)
def run_common_tests(self, test_presents=False):
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_base_model(*config_and_inputs)
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_lm_head(*config_and_inputs)
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_double_heads(*config_and_inputs)
if test_presents:
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_presents(*config_and_inputs)
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_commons(*config_and_inputs)
def run_slow_tests(self):
        # create_and_check_model_from_pretrained takes no inputs, so no config/inputs are prepared here
        self.create_and_check_model_from_pretrained()
...@@ -28,9 +28,15 @@ import torch ...@@ -28,9 +28,15 @@ import torch
from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
class TransfoXLModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel)
test_pruning = False
test_torchscript = False
test_resize_embeddings = False
class TransfoXLModelTest(unittest.TestCase):
class TransfoXLModelTester(object): class TransfoXLModelTester(object):
def __init__(self, def __init__(self,
...@@ -52,7 +58,6 @@ class TransfoXLModelTest(unittest.TestCase): ...@@ -52,7 +58,6 @@ class TransfoXLModelTest(unittest.TestCase):
num_hidden_layers=5, num_hidden_layers=5,
scope=None, scope=None,
seed=1, seed=1,
all_model_classes=(TransfoXLModel, TransfoXLLMHeadModel),
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
...@@ -73,7 +78,6 @@ class TransfoXLModelTest(unittest.TestCase): ...@@ -73,7 +78,6 @@ class TransfoXLModelTest(unittest.TestCase):
self.num_hidden_layers = num_hidden_layers self.num_hidden_layers = num_hidden_layers
self.scope = scope self.scope = scope
self.seed = seed self.seed = seed
self.all_model_classes = all_model_classes
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
...@@ -171,16 +175,31 @@ class TransfoXLModelTest(unittest.TestCase): ...@@ -171,16 +175,31 @@ class TransfoXLModelTest(unittest.TestCase):
list(list(mem.size()) for mem in result["mems_2"]), list(list(mem.size()) for mem in result["mems_2"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids_1} inputs_dict = {'input_ids': input_ids_1}
create_and_check_commons(self, config, inputs_dict, test_pruning=False, test_torchscript=False) return config, inputs_dict
def test_default(self): def setUp(self):
self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self)) self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self)
self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
def test_config(self): def test_config(self):
config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) self.config_tester.run_common_tests()
config_tester.run_common_tests()
def test_transfo_xl_model(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs)
self.model_tester.check_transfo_xl_model_output(output_result)
def test_transfo_xl_lm_head(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
self.model_tester.check_transfo_xl_lm_head_output(output_result)
@pytest.mark.slow @pytest.mark.slow
def test_model_from_pretrained(self): def test_model_from_pretrained(self):
...@@ -190,23 +209,6 @@ class TransfoXLModelTest(unittest.TestCase): ...@@ -190,23 +209,6 @@ class TransfoXLModelTest(unittest.TestCase):
shutil.rmtree(cache_dir) shutil.rmtree(cache_dir)
self.assertIsNotNone(model) self.assertIsNotNone(model)
def run_tester(self, tester):
config_and_inputs = tester.prepare_config_and_inputs()
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_transfo_xl_model(*config_and_inputs)
tester.check_transfo_xl_model_output(output_result)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
tester.check_transfo_xl_lm_head_output(output_result)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_transfo_xl_commons(*config_and_inputs)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
# coding=utf-8
# Copyright 2018 HuggingFace Inc..
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import logging
from pytorch_transformers import PretrainedConfig, PreTrainedModel
from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
class ModelUtilsTest(unittest.TestCase):
def test_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
config = BertConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, PretrainedConfig)
model = BertModel.from_pretrained(model_name)
model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, PreTrainedModel)
for value in loading_info.values():
self.assertEqual(len(value), 0)
config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
self.assertEqual(model.config.output_attentions, True)
self.assertEqual(model.config.output_hidden_states, True)
self.assertEqual(model.config, config)
if __name__ == "__main__":
unittest.main()
...@@ -23,10 +23,15 @@ import pytest ...@@ -23,10 +23,15 @@ import pytest
from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification) from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor) from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
class XLMModelTest(unittest.TestCase): class XLMModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (XLMModel, XLMWithLMHeadModel,
XLMForQuestionAnswering, XLMForSequenceClassification)
# , XLMForSequenceClassification, XLMForTokenClassification),
class XLMModelTester(object): class XLMModelTester(object):
def __init__(self, def __init__(self,
...@@ -58,8 +63,6 @@ class XLMModelTest(unittest.TestCase): ...@@ -58,8 +63,6 @@ class XLMModelTest(unittest.TestCase):
summary_type="last", summary_type="last",
use_proj=True, use_proj=True,
scope=None, scope=None,
all_model_classes = (XLMModel, XLMWithLMHeadModel,
XLMForQuestionAnswering, XLMForSequenceClassification), # , XLMForSequenceClassification, XLMForTokenClassification),
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
...@@ -90,7 +93,6 @@ class XLMModelTest(unittest.TestCase): ...@@ -90,7 +93,6 @@ class XLMModelTest(unittest.TestCase):
self.num_labels = num_labels self.num_labels = num_labels
self.num_choices = num_choices self.num_choices = num_choices
self.scope = scope self.scope = scope
self.all_model_classes = all_model_classes
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
...@@ -237,28 +239,23 @@ class XLMModelTest(unittest.TestCase): ...@@ -237,28 +239,23 @@ class XLMModelTest(unittest.TestCase):
[self.batch_size, self.type_sequence_label_size]) [self.batch_size, self.type_sequence_label_size])
def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_lengths,
sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths} inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths}
create_and_check_commons(self, config, inputs_dict) return config, inputs_dict
def test_default(self): def setUp(self):
self.run_tester(XLMModelTest.XLMModelTester(self)) self.model_tester = XLMModelTest.XLMModelTester(self)
self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
def test_config(self): def test_config(self):
config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) self.config_tester.run_common_tests()
config_tester.run_common_tests()
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
def run_tester(self, tester): def test_xlm_model(self):
config_and_inputs = tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
tester.create_and_check_xlm_model(*config_and_inputs) self.model_tester.create_and_check_xlm_model(*config_and_inputs)
# config_and_inputs = tester.prepare_config_and_inputs() # config_and_inputs = tester.prepare_config_and_inputs()
# tester.create_and_check_xlm_for_masked_lm(*config_and_inputs) # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs)
...@@ -275,8 +272,14 @@ class XLMModelTest(unittest.TestCase): ...@@ -275,8 +272,14 @@ class XLMModelTest(unittest.TestCase):
# config_and_inputs = tester.prepare_config_and_inputs() # config_and_inputs = tester.prepare_config_and_inputs()
# tester.create_and_check_xlm_for_token_classification(*config_and_inputs) # tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
config_and_inputs = tester.prepare_config_and_inputs() @pytest.mark.slow
tester.create_and_check_xlm_commons(*config_and_inputs) def test_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -28,9 +28,14 @@ import torch ...@@ -28,9 +28,14 @@ import torch
from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
class XLNetModelTest(CommonTestCases.CommonModelTester):
all_model_classes=(XLNetModel, XLNetLMHeadModel,
XLNetForSequenceClassification, XLNetForQuestionAnswering)
test_pruning = False
class XLNetModelTest(unittest.TestCase):
class XLNetModelTester(object): class XLNetModelTester(object):
def __init__(self, def __init__(self,
...@@ -56,8 +61,6 @@ class XLNetModelTest(unittest.TestCase): ...@@ -56,8 +61,6 @@ class XLNetModelTest(unittest.TestCase):
initializer_range=0.05, initializer_range=0.05,
seed=1, seed=1,
type_vocab_size=2, type_vocab_size=2,
all_model_classes=(XLNetModel, XLNetLMHeadModel,
XLNetForSequenceClassification, XLNetForQuestionAnswering),
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
...@@ -82,7 +85,6 @@ class XLNetModelTest(unittest.TestCase): ...@@ -82,7 +85,6 @@ class XLNetModelTest(unittest.TestCase):
self.seed = seed self.seed = seed
self.type_vocab_size = type_vocab_size self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size self.type_sequence_label_size = type_sequence_label_size
self.all_model_classes = all_model_classes
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
...@@ -264,17 +266,41 @@ class XLNetModelTest(unittest.TestCase): ...@@ -264,17 +266,41 @@ class XLNetModelTest(unittest.TestCase):
list(list(mem.size()) for mem in result["mems_1"]), list(list(mem.size()) for mem in result["mems_1"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, def prepare_config_and_inputs_for_common(self):
target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels): config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
target_mapping, inp_q, segment_ids, lm_labels,
sequence_labels, is_impossible_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids_1} inputs_dict = {'input_ids': input_ids_1}
create_and_check_commons(self, config, inputs_dict, test_pruning=False) return config, inputs_dict
def test_default(self): def setUp(self):
self.run_tester(XLNetModelTest.XLNetModelTester(self)) self.model_tester = XLNetModelTest.XLNetModelTester(self)
self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
def test_config(self): def test_config(self):
config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) self.config_tester.run_common_tests()
config_tester.run_common_tests()
def test_xlnet_base_model(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
def test_xlnet_lm_head(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
def test_xlnet_sequence_classif(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
def test_xlnet_qa(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
@pytest.mark.slow @pytest.mark.slow
def test_model_from_pretrained(self): def test_model_from_pretrained(self):
...@@ -284,27 +310,6 @@ class XLNetModelTest(unittest.TestCase): ...@@ -284,27 +310,6 @@ class XLNetModelTest(unittest.TestCase):
shutil.rmtree(cache_dir) shutil.rmtree(cache_dir)
self.assertIsNotNone(model) self.assertIsNotNone(model)
def run_tester(self, tester):
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_base_model(*config_and_inputs)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_lm_head(*config_and_inputs)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_qa(*config_and_inputs)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_commons(*config_and_inputs)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()