Unverified commit 292140b9, authored by Thomas Wolf and committed by GitHub

Merge pull request #781 from huggingface/embeddings

Clean up input embeddings resizing and weights tying
parents 3821ecbf c57e9d94
@@ -507,23 +507,17 @@ class BertPredictionHeadTransform(nn.Module):
 class BertLMPredictionHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertLMPredictionHead, self).__init__()
         self.transform = BertPredictionHeadTransform(config)
-        self.torchscript = config.torchscript

         # The output weights are the same as the input embeddings, but there is
         # an output-only bias for each token.
-        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
-                                 bert_model_embedding_weights.size(0),
+        self.decoder = nn.Linear(config.hidden_size,
+                                 config.vocab_size,
                                  bias=False)
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
-        else:
-            self.decoder.weight = bert_model_embedding_weights
-        self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)

@@ -532,9 +526,9 @@ class BertLMPredictionHead(nn.Module):
 class BertOnlyMLMHead(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertOnlyMLMHead, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)

     def forward(self, sequence_output):
         prediction_scores = self.predictions(sequence_output)

@@ -552,9 +546,9 @@ class BertOnlyNSPHead(nn.Module):
 class BertPreTrainingHeads(nn.Module):
-    def __init__(self, config, bert_model_embedding_weights):
+    def __init__(self, config):
         super(BertPreTrainingHeads, self).__init__()
-        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
+        self.predictions = BertLMPredictionHead(config)
         self.seq_relationship = nn.Linear(config.hidden_size, 2)

     def forward(self, sequence_output, pooled_output):

@@ -619,6 +613,12 @@ class BertModel(BertPreTrainedModel):
         self.apply(self.init_weights)

+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+        return self.embeddings.word_embeddings
+
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}

@@ -750,9 +750,17 @@ class BertForPreTraining(BertPreTrainedModel):
         super(BertForPreTraining, self).__init__(config)

         self.bert = BertModel(config)
-        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertPreTrainingHeads(config)

         self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
+                                   self.bert.embeddings.word_embeddings)

     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
                 next_sentence_label=None, head_mask=None):

@@ -845,9 +853,17 @@ class BertForMaskedLM(BertPreTrainedModel):
         super(BertForMaskedLM, self).__init__(config)

         self.bert = BertModel(config)
-        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+        self.cls = BertOnlyMLMHead(config)

         self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.cls.predictions.decoder,
+                                   self.bert.embeddings.word_embeddings)

     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         """
......
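For reference, a minimal sketch of what the new BERT wiring gives you (the tiny config values are arbitrary, and the asserts assume the default non-TorchScript config, so tie_weights shares the tensor instead of cloning it):

from pytorch_transformers import BertConfig, BertForMaskedLM

# Small randomly initialized model; no pretrained download needed.
config = BertConfig(vocab_size_or_config_json_file=100, hidden_size=32,
                    num_hidden_layers=2, num_attention_heads=2, intermediate_size=64)
model = BertForMaskedLM(config)

# With torchscript=False, the MLM decoder and the input word embeddings
# now point to the same Parameter object.
assert model.cls.predictions.decoder.weight is model.bert.embeddings.word_embeddings.weight

# resize_token_embeddings (added in modeling_utils below) grows the matrix,
# updates config.vocab_size and re-ties the decoder to the resized embeddings.
model.resize_token_embeddings(110)
assert model.config.vocab_size == 110
assert model.cls.predictions.decoder.weight.shape[0] == 110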
@@ -104,7 +104,6 @@ class GPT2Config(PretrainedConfig):
     Args:
         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -119,14 +118,12 @@ class GPT2Config(PretrainedConfig):
         embd_pdrop: The dropout ratio for the embeddings.
         initializer_range: The sttdev of the truncated_normal_initializer for
             initializing all weight matrices.
-        predict_special_tokens: should we predict special tokens (when the model has a LM head)
     """
     pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(
         self,
         vocab_size_or_config_json_file=50257,
-        n_special=0,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -137,7 +134,6 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        predict_special_tokens=True,
         num_labels=1,
         summary_type='token_ids',
@@ -151,7 +147,6 @@ class GPT2Config(PretrainedConfig):
         Args:
             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -166,7 +161,6 @@ class GPT2Config(PretrainedConfig):
             embd_pdrop: The dropout ratio for the embeddings.
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
-            predict_special_tokens: should we predict special tokens (when the model has a LM head)
         """
         super(GPT2Config, self).__init__(**kwargs)
@@ -178,7 +172,6 @@ class GPT2Config(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -189,7 +182,6 @@ class GPT2Config(PretrainedConfig):
             self.attn_pdrop = attn_pdrop
             self.layer_norm_epsilon = layer_norm_epsilon
             self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
             self.num_labels = num_labels
             self.summary_type = summary_type
@@ -203,10 +195,6 @@ class GPT2Config(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )

-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
     @property
     def hidden_size(self):
         return self.n_embd

@@ -347,34 +335,6 @@ class Block(nn.Module):
         return outputs  # x, present, (attentions)

-class GPT2LMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(GPT2LMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        self.torchscript = config.torchscript
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-        # Export to TorchScript can't handle parameter sharing so we are cloning them.
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
-        else:
-            self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
 class GPT2PreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.

@@ -400,36 +360,6 @@ class GPT2PreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)

-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `gpt2`
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . a TensorFlow checkpoint with trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific GPT2 class
-        """
-        num_special_tokens = kwargs.pop('num_special_tokens', None)
-        model = super(GPT2PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens)
-        return model
-
 class GPT2Model(GPT2PreTrainedModel):
     """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").

@@ -447,13 +377,13 @@ class GPT2Model(GPT2PreTrainedModel):
             config.vocab_size - 1,     ______________________
             config.vocab_size,
             ...                        -> special embeddings
-            config.vocab_size + config.n_special - 1]  ______________________
+            config.vocab_size + n_special - 1]  ______________________

-        where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
+        where total_tokens_embeddings is equal to

         ::

-            total_tokens_embeddings = config.vocab_size + config.n_special
+            total_tokens_embeddings = vocab_size + n_special

         You should use the associated indices to index the embeddings.

@@ -474,7 +404,7 @@ class GPT2Model(GPT2PreTrainedModel):
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions

-        self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
         self.wpe = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])

@@ -482,26 +412,9 @@ class GPT2Model(GPT2PreTrainedModel):
         self.apply(self.init_weights)

-    def set_num_special_tokens(self, num_special_tokens=None):
-        """
-        Update input embeddings with new embedding matrix if needed.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-        TODO Lysandre filled args
-        """
-        if num_special_tokens is None or self.config.n_special == num_special_tokens:
-            return
-
-        # Update config
-        self.config.n_special = num_special_tokens
-
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.wte
-        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.wte.to(old_embed.weight.device)
-        self.init_weights(self.wte)
-
-        # Copy word embeddings from the previous weights
-        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
+        return self.wte

     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.

@@ -641,23 +554,17 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     def __init__(self, config):
         super(GPT2LMHeadModel, self).__init__(config)
         self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-        TODO Lysandre filled args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.wte)

     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
         """

@@ -740,25 +647,17 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
         self.transformer = GPT2Model(config)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)

         self.apply(self.init_weights)

-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """
-        Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-        TODO Lysandre filled args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.wte)

     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, past=None, head_mask=None):
......
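With GPT2LMHead and set_num_special_tokens gone, growing the vocabulary now goes through the generic resize_token_embeddings added in modeling_utils further down. A rough sketch of the replacement workflow (the small config values and the number of added tokens are arbitrary):

from pytorch_transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(vocab_size_or_config_json_file=1000, n_positions=64, n_ctx=64,
                    n_embd=32, n_layer=2, n_head=2)
model = GPT2LMHeadModel(config)

# Previously: model.set_num_special_tokens(num_special_tokens)
num_added_tokens = 3
model.resize_token_embeddings(config.vocab_size + num_added_tokens)

assert model.transformer.wte.weight.shape[0] == 1003
assert model.lm_head.weight.shape[0] == 1003  # lm_head was re-tied to the resized wte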
@@ -156,7 +156,6 @@ class OpenAIGPTConfig(PretrainedConfig):
     def __init__(
         self,
         vocab_size_or_config_json_file=40478,
-        n_special=0,
         n_positions=512,
         n_ctx=512,
         n_embd=768,
@@ -190,7 +189,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.n_ctx = n_ctx
             self.n_positions = n_positions
             self.n_embd = n_embd
@@ -216,10 +214,6 @@ class OpenAIGPTConfig(PretrainedConfig):
                 "or the path to a pretrained model config file (str)"
             )

-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
     @property
     def hidden_size(self):
         return self.n_embd

@@ -355,34 +349,6 @@ class Block(nn.Module):
         return outputs

-class OpenAIGPTLMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(OpenAIGPTLMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        self.torchscript = config.torchscript
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-        if self.torchscript:
-            self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
-        else:
-            self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
 class OpenAIGPTPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.

@@ -408,36 +374,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)

-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-                - a path or url to a pretrained model archive containing:
-                    . `config.json` a configuration file for the model
-                    . a series of NumPy files containing OpenAI TensorFlow trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific OpenAI-GPT class
-        """
-        num_special_tokens = kwargs.get('num_special_tokens', None)
-        kwargs.pop('num_special_tokens', None)
-
-        model = super(OpenAIGPTPreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens)
-        return model
-
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").

@@ -457,13 +393,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
             config.vocab_size - 1,     ______________________
             config.vocab_size,
             ...                        -> special embeddings
-            config.vocab_size + config.n_special - 1]  ______________________
+            config.vocab_size + n_special - 1]  ______________________

-        where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+        where ``total_tokens_embeddings`` is:

         ::

-            total_tokens_embeddings = config.vocab_size + config.n_special
+            total_tokens_embeddings = config.vocab_size + n_special

         You should use the associated indices to index the embeddings.

@@ -485,34 +421,16 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states

-        self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
+        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
         self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
         self.drop = nn.Dropout(config.embd_pdrop)
         self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])

         self.apply(self.init_weights)

-    def set_num_special_tokens(self, num_special_tokens=None):
-        """
-        Update input embeddings with new embedding matrice if needed
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-        TODO Lysandre filled Args
-        """
-        if num_special_tokens is None or self.config.n_special == num_special_tokens:
-            return
-
-        # Update config
-        self.config.n_special = num_special_tokens
-
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.tokens_embed
-        self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.tokens_embed.to(old_embed.weight.device)
-        self.init_weights(self.tokens_embed)
-
-        # Copy word embeddings from the previous weights
-        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
+        return self.tokens_embed

     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.

@@ -657,24 +575,17 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTLMHeadModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """
-        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-        TODO Lysandre filled Args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)

     def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
         """

@@ -747,13 +658,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
             config.vocab_size - 1,     ______________________
             config.vocab_size,
             ...                        -> special embeddings
-            config.vocab_size + config.n_special - 1]  ______________________
+            config.vocab_size + n_special - 1]  ______________________

-        where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
+        where ``total_tokens_embeddings`` is:

         ::

-            total_tokens_embeddings = config.vocab_size + config.n_special
+            total_tokens_embeddings = config.vocab_size + n_special

         You should use the associate indices to index the embeddings.

@@ -773,24 +684,18 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
         self.transformer = OpenAIGPTModel(config)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)

         self.apply(self.init_weights)
+        self.tie_weights()

-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-
-        Args:
-            num_special_tokens: Special tokens to be added to the embedding matrix
-            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
-                Defaults to True.
-        TODO Lysandre filled Args
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)

     def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                 position_ids=None, head_mask=None):
......
@@ -287,6 +287,14 @@ class TransfoXLConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")

+    @property
+    def vocab_size(self):
+        return self.n_token
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
     @property
     def hidden_size(self):
         return self.d_model

@@ -998,6 +1006,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.apply(self.init_weights)

+    def _resize_token_embeddings(self, new_num_tokens):
+        return self.word_emb
+
     def backward_compatible(self):
         self.sample_softmax = -1

@@ -1273,13 +1284,20 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         else:
             if self.config.tie_weight:
                 for i in range(len(self.crit.out_layers)):
-                    self.crit.out_layers[i].weight = self.transformer.word_emb.emb_layers[i].weight
+                    self._tie_or_clone_weights(self.crit.out_layers[i],
+                                               self.transformer.word_emb.emb_layers[i])
             if self.config.tie_projs:
                 for i, tie_proj in enumerate(self.config.tie_projs):
                     if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
-                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+                        if self.config.torchscript:
+                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
+                        else:
+                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
                     elif tie_proj and self.config.div_val != 1:
-                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+                        if self.config.torchscript:
+                            self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
+                        else:
+                            self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]

     def reset_length(self, tgt_len, ext_len, mem_len):
         self.transformer.reset_length(tgt_len, ext_len, mem_len)
......
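The clone-or-share branches above for the adaptive-softmax projections mirror what the new PreTrainedModel._tie_or_clone_weights helper (in the modeling_utils hunk below) does for regular modules. A plain-PyTorch illustration of the two behaviours, with made-up module names:

import torch
import torch.nn as nn

embedding = nn.Embedding(10, 4)
tied_head = nn.Linear(4, 10, bias=False)
cloned_head = nn.Linear(4, 10, bias=False)

# torchscript=False path: both modules reference the same Parameter,
# so an update through one is visible through the other.
tied_head.weight = embedding.weight
assert tied_head.weight is embedding.weight

# torchscript=True path: an independent copy with equal values, which keeps
# torch.jit.trace from tripping over shared parameters during export.
cloned_head.weight = nn.Parameter(embedding.weight.clone())
assert cloned_head.weight is not embedding.weight
assert torch.equal(cloned_head.weight, embedding.weight)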
@@ -151,6 +151,7 @@ class PreTrainedModel(nn.Module):
     pretrained_model_archive_map = {}
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""
+    input_embeddings = None

     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()

@@ -164,12 +165,79 @@ class PreTrainedModel(nn.Module):
         # Save config in model
         self.config = config

+    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
+        """ Build a resized Embedding Module from a provided token Embedding Module.
+            Increasing the size will add newly initialized vectors at the end
+            Reducing the size will remove vectors from the end
+
+        Args:
+            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+                If not provided or None: return the provided token Embedding Module.
+        Return:
+            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
+        """
+        if new_num_tokens is None:
+            return old_embeddings
+
+        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+        if old_num_tokens == new_num_tokens:
+            return old_embeddings
+
+        # Build new embeddings
+        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
+        new_embeddings.to(old_embeddings.weight.device)
+
+        # initialize all new embeddings (in particular added tokens)
+        self.init_weights(new_embeddings)
+
+        # Copy word embeddings from the previous weights
+        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
+        new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
+
+        return new_embeddings
+
+    def _tie_or_clone_weights(self, first_module, second_module):
+        """ Tie or clone module weights depending of weither we are using TorchScript or not
+        """
+        if self.config.torchscript:
+            first_module.weight = nn.Parameter(second_module.weight.clone())
+        else:
+            first_module.weight = second_module.weight
+
+    def resize_token_embeddings(self, new_num_tokens=None):
+        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
+
+        Args:
+            new_num_tokens: (Optional) New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+                If not provided or None: does nothing.
+        Return:
+            Pointer to the input tokens Embedding Module of the model
+        """
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        model_embeds = base_model._resize_token_embeddings(new_num_tokens)
+        if new_num_tokens is None:
+            return model_embeds
+
+        # Update base model and current model config
+        self.config.vocab_size = new_num_tokens
+        base_model.vocab_size = new_num_tokens
+
+        # Tie weights again if needed
+        if hasattr(self, 'tie_weights'):
+            self.tie_weights()
+
+        return model_embeds
+
     def prune_heads(self, heads_to_prune):
         """ Prunes heads of the base model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
         """
-        model_to_prune = getattr(self, self.base_model_prefix, self)  # get the base model if needed
-        model_to_prune._prune_heads(heads_to_prune)
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        base_model._prune_heads(heads_to_prune)

     def save_pretrained(self, save_directory):
         """ Save a model with its configuration file to a directory, so that it
......
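A standalone sketch of the copy semantics _get_resized_embeddings implements, in plain PyTorch (note the real method also runs the model-specific init_weights on the fresh module before copying, which this sketch skips):

import torch
import torch.nn as nn

old_embeddings = nn.Embedding(5, 3)
new_num_tokens = 8

# Fresh, larger module on the same device, then copy the overlapping rows.
new_embeddings = nn.Embedding(new_num_tokens, 3)
new_embeddings.to(old_embeddings.weight.device)
num_tokens_to_copy = min(old_embeddings.num_embeddings, new_num_tokens)
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]

assert torch.equal(new_embeddings.weight[:5], old_embeddings.weight)  # old vectors preserved
assert new_embeddings.weight.shape == (8, 3)                          # rows 5..7 are newly initialized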
@@ -104,7 +104,6 @@ class XLMConfig(PretrainedConfig):
     def __init__(self,
                  vocab_size_or_config_json_file=30145,
-                 n_special=0,
                  emb_dim=2048,
                  n_layers=12,
                  n_heads=16,
@@ -148,7 +147,6 @@ class XLMConfig(PretrainedConfig):
                 self.__dict__[key] = value
         elif isinstance(vocab_size_or_config_json_file, int):
             self.n_words = vocab_size_or_config_json_file
-            self.n_special = n_special
             self.emb_dim = emb_dim
             self.n_layers = n_layers
             self.n_heads = n_heads
@@ -183,8 +181,12 @@ class XLMConfig(PretrainedConfig):
                              "or the path to a pretrained model config file (str)")

     @property
-    def total_tokens_embeddings(self):
-        return self.n_words + self.n_special
+    def vocab_size(self):
+        return self.n_words
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_words = value

     @property
     def hidden_size(self):

@@ -479,6 +481,10 @@ class XLMModel(XLMPreTrainedModel):
         self.apply(self.init_weights)

+    def _resize_token_embeddings(self, new_num_tokens):
+        self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
+        return self.embeddings
+
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
             heads_to_prune: dict of {layer_num: list of heads to prune in this layer}

@@ -718,8 +724,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     """
     def __init__(self, config):
         super(XLMWithLMHeadModel, self).__init__(config)
-        self.torchscript = config.torchscript
-
         self.transformer = XLMModel(config)
         self.pred_layer = XLMPredLayer(config)

@@ -729,10 +733,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.torchscript:
-            self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
-        else:
-            self.pred_layer.proj.weight = self.transformer.embeddings.weight
+        self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)

     def forward(self, input_ids, lengths=None, positions=None, langs=None, token_type_ids=None,
                 attention_mask=None, cache=None, labels=None, head_mask=None):
......
@@ -312,6 +312,14 @@ class XLNetConfig(PretrainedConfig):
             raise ValueError("First argument must be either a vocabulary size (int)"
                              "or the path to a pretrained model config file (str)")

+    @property
+    def vocab_size(self):
+        return self.n_token
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
     @property
     def hidden_size(self):
         return self.d_model

@@ -654,9 +662,12 @@ class XLNetModel(XLNetPreTrainedModel):
         self.apply(self.init_weights)

+    def _resize_token_embeddings(self, new_num_tokens):
+        self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
+        return self.word_embedding
+
     def _prune_heads(self, heads_to_prune):
-        logger.info("Head pruning is not implemented for XLNet")
-        pass
+        raise NotImplementedError

     def create_mask(self, qlen, mlen):
         """

@@ -970,23 +981,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         super(XLNetLMHeadModel, self).__init__(config)
         self.attn_type = config.attn_type
         self.same_length = config.same_length
-        self.torchscript = config.torchscript

         self.transformer = XLNetModel(config)
         self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)

-        # Tie weights
         self.apply(self.init_weights)
         self.tie_weights()

     def tie_weights(self):
         """ Make sure we are sharing the embeddings
         """
-        if self.torchscript:
-            self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
-        else:
-            self.lm_loss.weight = self.transformer.word_embedding.weight
+        self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)

     def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
......
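For the configs that keep their historical attribute names (n_token for XLNet and Transfo-XL, n_words for XLM), the new vocab_size property above simply aliases that attribute, which is what lets resize_token_embeddings update every config the same way. A quick sketch with XLNetConfig (the vocabulary sizes are arbitrary):

from pytorch_transformers import XLNetConfig

config = XLNetConfig(vocab_size_or_config_json_file=32000)
assert config.vocab_size == config.n_token == 32000

# resize_token_embeddings writes through the property setter.
config.vocab_size = 32010
assert config.n_token == 32010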
@@ -26,10 +26,15 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                   BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP

-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)

-class BertModelTest(unittest.TestCase):
+class BertModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
+                         BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
+                         BertForTokenClassification)

     class BertModelTester(object):

         def __init__(self,

@@ -55,9 +60,6 @@ class BertModelTest(unittest.TestCase):
                      num_labels=3,
                      num_choices=4,
                      scope=None,
-                     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
-                                          BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-                                          BertForTokenClassification),
                      ):
             self.parent = parent
             self.batch_size = batch_size

@@ -81,7 +83,6 @@ class BertModelTest(unittest.TestCase):
             self.num_labels = num_labels
             self.num_choices = num_choices
             self.scope = scope
-            self.all_model_classes = all_model_classes

         def prepare_config_and_inputs(self):
             input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

@@ -253,52 +254,59 @@ class BertModelTest(unittest.TestCase):
             self.check_loss_output(result)

-        def create_and_check_bert_commons(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
             inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
-            create_and_check_commons(self, config, inputs_dict)
+            return config, inputs_dict

-    def test_default(self):
-        self.run_tester(BertModelTest.BertModelTester(self))
+    def setUp(self):
+        self.model_tester = BertModelTest.BertModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)

     def test_config(self):
-        config_tester = ConfigTester(self, config_class=BertConfig, hidden_size=37)
-        config_tester.run_common_tests()
+        self.config_tester.run_common_tests()

-    @pytest.mark.slow
-    def test_model_from_pretrained(self):
-        cache_dir = "/tmp/pytorch_transformers_test/"
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
-            shutil.rmtree(cache_dir)
-            self.assertIsNotNone(model)
+    def test_bert_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_model(*config_and_inputs)

-    def run_tester(self, tester):
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_model(*config_and_inputs)
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
+    def test_for_next_sequence_prediction(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_next_sequence_prediction(*config_and_inputs)
+    def test_for_pretraining(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_pretraining(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_pretraining(*config_and_inputs)
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_question_answering(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_question_answering(*config_and_inputs)
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_sequence_classification(*config_and_inputs)
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)

-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_for_token_classification(*config_and_inputs)
-
-        config_and_inputs = tester.prepare_config_and_inputs()
-        tester.create_and_check_bert_commons(*config_and_inputs)
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_transformers_test/"
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)

 if __name__ == "__main__":
     unittest.main()
@@ -16,19 +16,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import os
 import unittest
-import json
-import random
-import shutil

 import pytest
-import torch

 from pytorch_transformers import (GPT2Config, GPT2Model,
                                   GPT2LMHeadModel, GPT2DoubleHeadsModel)

-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester

 class GPT2ModelTest(unittest.TestCase):

@@ -37,14 +32,14 @@ class GPT2ModelTest(unittest.TestCase):
         config_tester.run_common_tests()

     def test_model(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
-                                      lm_head_model_class=GPT2LMHeadModel,
-                                      double_head_model_class=GPT2DoubleHeadsModel)
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+                                                      lm_head_model_class=GPT2LMHeadModel,
+                                                      double_head_model_class=GPT2DoubleHeadsModel)
         model_tester.run_common_tests(test_presents=True)

     @pytest.mark.slow
     def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
-                                      lm_head_model_class=GPT2LMHeadModel,
-                                      double_head_model_class=GPT2DoubleHeadsModel)
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
+                                                      lm_head_model_class=GPT2LMHeadModel,
+                                                      double_head_model_class=GPT2DoubleHeadsModel)
         model_tester.run_slow_tests()
......
@@ -19,12 +19,11 @@ from __future__ import print_function
 import unittest

 import pytest
-import torch

 from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
                                   OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)

-from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
+from .modeling_common_test import CommonTestCases, ConfigTester

 class OpenAIModelTest(unittest.TestCase):

@@ -33,14 +32,14 @@ class OpenAIModelTest(unittest.TestCase):
         config_tester.run_common_tests()

     def test_model(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
-                                      lm_head_model_class=OpenAIGPTLMHeadModel,
-                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+                                                      lm_head_model_class=OpenAIGPTLMHeadModel,
+                                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
         model_tester.run_common_tests(test_presents=False)

     @pytest.mark.slow
     def test_pretrained(self):
-        model_tester = GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
-                                      lm_head_model_class=OpenAIGPTLMHeadModel,
-                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
+        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
+                                                      lm_head_model_class=OpenAIGPTLMHeadModel,
+                                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
         model_tester.run_slow_tests()
......
# coding=utf-8
# Copyright 2019 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import os
import shutil
import json
import random
import torch
def _config_zero_init(config):
configs_no_init = copy.deepcopy(config)
for key in configs_no_init.__dict__.keys():
if '_range' in key or '_std' in key:
setattr(configs_no_init, key, 0.0)
return configs_no_init
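# Illustrative sketch (not part of the original suite): _config_zero_init deep-copies
# the config and zeroes every "*_range" / "*_std" attribute so that freshly built
# models have deterministic, NaN-free weights. The dummy config below is a made-up
# stand-in used only to show the effect; it is never called by the tests.
def _example_config_zero_init():
    class _DummyConfig(object):
        def __init__(self):
            self.initializer_range = 0.02
            self.hidden_size = 32
    zeroed = _config_zero_init(_DummyConfig())
    assert zeroed.initializer_range == 0.0   # init scales are zeroed out
    assert zeroed.hidden_size == 32          # other attributes are left untouched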
def _create_and_check_torchscript_output_attentions(tester, model_classes, config, inputs_dict):
config.output_attentions = True
_create_and_check_torchscript(tester, model_classes, config, inputs_dict)
def _create_and_check_torchscript_output_hidden_state(tester, model_classes, config, inputs_dict):
config.output_hidden_states = True
_create_and_check_torchscript(tester, model_classes, config, inputs_dict)
def _create_and_check_torchscript(tester, model_classes, config, inputs_dict):
    configs_no_init = _config_zero_init(config)  # To be sure we have no NaN
configs_no_init.torchscript = True
for model_class in model_classes:
model = model_class(config=configs_no_init)
model.eval()
inputs = inputs_dict['input_ids'] # Let's keep only input_ids
try:
torch.jit.trace(model, inputs)
except RuntimeError:
tester.parent.fail("Couldn't trace module.")
        try:
            traced_model = torch.jit.trace(model, inputs)
            torch.jit.save(traced_model, "traced_model.pt")
        except RuntimeError:
            tester.parent.fail("Couldn't save module.")
try:
loaded_model = torch.jit.load("traced_model.pt")
os.remove("traced_model.pt")
except ValueError:
tester.parent.fail("Couldn't load module.")
model.eval()
loaded_model.eval()
model_params = model.parameters()
loaded_model_params = loaded_model.parameters()
models_equal = True
for p1, p2 in zip(model_params, loaded_model_params):
if p1.data.ne(p2.data).sum() > 0:
models_equal = False
tester.parent.assertTrue(models_equal)
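# A minimal sketch (assumption: any torch.jit.ScriptModule) of the same save/load
# round trip as above, but written against a temporary directory so the traced
# artifact never lands in the working directory. It is illustrative only and is not
# wired into the checks.
def _example_torchscript_roundtrip(traced_module):
    import tempfile
    tmp_dir = tempfile.mkdtemp()
    try:
        path = os.path.join(tmp_dir, "traced_model.pt")
        torch.jit.save(traced_module, path)     # serialize the traced graph and weights
        return torch.jit.load(path)             # reload it as an independent module
    finally:
        shutil.rmtree(tmp_dir)                  # always clean up the temporary files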
def _create_and_check_initialization(tester, model_classes, config, inputs_dict):
configs_no_init = _config_zero_init(config)
for model_class in model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if param.requires_grad:
tester.parent.assertIn(param.data.mean().item(), [0.0, 1.0],
msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
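# Why the mean must be exactly 0.0 or 1.0 here: with _config_zero_init all "*_range"
# and "*_std" values are zero, so randomly initialized weights collapse to zeros,
# while parameters initialized to ones (e.g. LayerNorm weights) keep a mean of 1.0.
# Any other mean points at a parameter that bypasses the model's weight initialization.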
def _create_and_check_for_headmasking(tester, model_classes, config, inputs_dict):
    configs_no_init = _config_zero_init(config)  # To be sure we have no NaN
for model_class in model_classes:
config.output_attentions = True
config.output_hidden_states = True
model = model_class(config=configs_no_init)
model.eval()
# Prepare head_mask
        # Set requires_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
head_mask = torch.ones(tester.num_hidden_layers, tester.num_attention_heads)
head_mask[0, 0] = 0
head_mask[-1, :-1] = 0
head_mask.requires_grad_(requires_grad=True)
inputs = inputs_dict.copy()
inputs['head_mask'] = head_mask
outputs = model(**inputs)
# Test that we can get a gradient back for importance score computation
output = sum(t.sum() for t in outputs[0])
output = output.sum()
output.backward()
multihead_outputs = head_mask.grad
attentions = outputs[-1]
hidden_states = outputs[-2]
        # Check that a gradient flows back to the head mask and that masked heads produce (near) zero attention
tester.parent.assertIsNotNone(multihead_outputs)
tester.parent.assertEqual(len(multihead_outputs), tester.num_hidden_layers)
tester.parent.assertAlmostEqual(
attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
tester.parent.assertNotEqual(
attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
tester.parent.assertNotEqual(
attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
tester.parent.assertAlmostEqual(
attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
tester.parent.assertNotEqual(
attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
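# Reading of the assertions above (explanatory note, not original code): the mask
# zeroes head 0 of the first layer and every head except the last one of the final
# layer, so those attention slices must sum to ~0, while unmasked heads (the last
# head of the first layer, head 0 of the second layer, the last head of the final
# layer) must keep non-zero attention weights.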
def _create_and_check_for_head_pruning(tester, model_classes, config, inputs_dict):
for model_class in model_classes:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config=config)
model.eval()
heads_to_prune = {0: list(range(1, tester.num_attention_heads)),
-1: [0]}
model.prune_heads(heads_to_prune)
outputs = model(**inputs_dict)
attentions = outputs[-1]
tester.parent.assertEqual(
attentions[0].shape[-3], 1)
tester.parent.assertEqual(
attentions[1].shape[-3], tester.num_attention_heads)
tester.parent.assertEqual(
attentions[-1].shape[-3], tester.num_attention_heads - 1)
def _create_and_check_for_attentions(tester, model_classes, config, inputs_dict):
for model_class in model_classes:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config)
model.eval()
outputs = model(**inputs_dict)
attentions = outputs[-1]
tester.parent.assertEqual(model.config.output_attentions, True)
tester.parent.assertEqual(model.config.output_hidden_states, False)
tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
tester.parent.assertListEqual(
list(attentions[0].shape[-3:]),
[tester.num_attention_heads,
tester.seq_length,
tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
out_len = len(outputs)
# Check attention is always last and order is fine
config.output_attentions = True
config.output_hidden_states = True
model = model_class(config)
model.eval()
outputs = model(**inputs_dict)
tester.parent.assertEqual(out_len+1, len(outputs))
tester.parent.assertEqual(model.config.output_attentions, True)
tester.parent.assertEqual(model.config.output_hidden_states, True)
attentions = outputs[-1]
tester.parent.assertEqual(len(attentions), tester.num_hidden_layers)
tester.parent.assertListEqual(
list(attentions[0].shape[-3:]),
[tester.num_attention_heads,
tester.seq_length,
tester.key_len if hasattr(tester, 'key_len') else tester.seq_length])
def _create_and_check_for_hidden_states(tester, model_classes, config, inputs_dict):
for model_class in model_classes:
config.output_hidden_states = True
config.output_attentions = False
model = model_class(config)
model.eval()
outputs = model(**inputs_dict)
hidden_states = outputs[-1]
tester.parent.assertEqual(model.config.output_attentions, False)
tester.parent.assertEqual(model.config.output_hidden_states, True)
tester.parent.assertEqual(len(hidden_states), tester.num_hidden_layers + 1)
tester.parent.assertListEqual(
list(hidden_states[0].shape[-2:]),
[tester.seq_length, tester.hidden_size])
def create_and_check_commons(tester, config, inputs_dict, test_pruning=True, test_torchscript=True):
_create_and_check_initialization(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_for_attentions(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_for_headmasking(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_for_hidden_states(tester, tester.all_model_classes, config, inputs_dict)
if test_torchscript:
_create_and_check_torchscript(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_torchscript_output_attentions(tester, tester.all_model_classes, config, inputs_dict)
_create_and_check_torchscript_output_hidden_state(tester, tester.all_model_classes, config, inputs_dict)
if test_pruning:
_create_and_check_for_head_pruning(tester, tester.all_model_classes, config, inputs_dict)
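# Per-model testers funnel into create_and_check_commons with their own inputs_dict,
# switching off the passes an architecture cannot support (the Transfo-XL tester in
# this change disables both pruning and TorchScript, XLNet disables pruning only).
# Illustrative call:
#
#   create_and_check_commons(self, config, {'input_ids': input_ids},
#                            test_pruning=False, test_torchscript=False)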
def ids_tensor(shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
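# Small usage sketch for ids_tensor (not called by the tests): the helper returns a
# torch.long tensor of the requested shape whose values all fall inside the vocabulary.
def _example_ids_tensor_usage():
    sample = ids_tensor([2, 3], vocab_size=10)
    assert sample.shape == (2, 3)             # shape follows the requested dimensions
    assert sample.dtype == torch.long         # token ids are 64-bit integers
    assert int(sample.min()) >= 0 and int(sample.max()) < 10   # ids stay inside the vocab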
class ConfigTester(object):
def __init__(self, parent, config_class=None, **kwargs):
self.parent = parent
self.config_class = config_class
self.inputs_dict = kwargs
def create_and_test_config_common_properties(self):
config = self.config_class(**self.inputs_dict)
self.parent.assertTrue(hasattr(config, 'hidden_size'))
self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
def create_and_test_config_to_json_string(self):
config = self.config_class(**self.inputs_dict)
obj = json.loads(config.to_json_string())
for key, value in self.inputs_dict.items():
self.parent.assertEqual(obj[key], value)
def create_and_test_config_to_json_file(self):
config_first = self.config_class(**self.inputs_dict)
json_file_path = "/tmp/config.json"
config_first.to_json_file(json_file_path)
config_second = self.config_class.from_json_file(json_file_path)
os.remove(json_file_path)
self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
def run_common_tests(self):
self.create_and_test_config_common_properties()
self.create_and_test_config_to_json_string()
self.create_and_test_config_to_json_file()
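# Typical wiring of ConfigTester inside a model test case, mirroring the XLM/XLNet
# tests touched by this change (placement shown for illustration only):
#
#   def setUp(self):
#       self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
#
#   def test_config(self):
#       self.config_tester.run_common_tests()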
class GPTModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_position_ids=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
n_special=1,
n_positions=33,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
n_choices=3,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
scope=None,
config_class=None,
base_model_class=None,
lm_head_model_class=None,
double_head_model_class=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_position_ids = use_position_ids
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_special = n_special
self.n_positions = n_positions
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.n_choices = n_choices
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.scope = scope
self.config_class = config_class
self.base_model_class = base_model_class
self.lm_head_model_class = lm_head_model_class
self.double_head_model_class = double_head_model_class
self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
def prepare_config_and_inputs(self):
total_num_tokens = self.vocab_size + self.n_special
input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
position_ids = None
if self.use_position_ids:
position_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)
token_type_ids = None
if self.use_token_type_ids:
total_voc = self.vocab_size
token_type_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
mc_labels = None
lm_labels = None
mc_token_ids = None
if self.use_labels:
mc_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
lm_labels = ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
config = self.config_class(
vocab_size_or_config_json_file=self.vocab_size,
n_special=self.n_special,
n_positions=self.n_positions,
n_embd=self.hidden_size,
n_layer=self.num_hidden_layers,
n_head=self.num_attention_heads,
initializer_range=self.initializer_range)
return (config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids)
def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.base_model_class(config)
model.eval()
outputs = model(input_ids, position_ids, token_type_ids)
outputs = model(input_ids, position_ids)
outputs = model(input_ids)
hidden_state = outputs[0]
self.parent.assertListEqual(
list(hidden_state.size()),
[self.batch_size, self.n_choices, self.seq_length, self.hidden_size])
def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.lm_head_model_class(config)
model.eval()
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
loss, lm_logits = outputs[:2]
total_voc = self.n_special + self.vocab_size
self.parent.assertListEqual(
list(lm_logits.size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertListEqual(
list(loss.size()),
[])
def create_and_check_presents(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
for model_class in self.all_model_classes:
model = model_class(config)
model.eval()
outputs = model(input_ids)
presents = outputs[-1]
self.parent.assertEqual(self.num_hidden_layers, len(presents))
self.parent.assertListEqual(
list(presents[0].size()),
[2, self.batch_size * self.n_choices, self.num_attention_heads,
self.seq_length, self.hidden_size // self.num_attention_heads])
def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.double_head_model_class(config)
model.eval()
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
token_type_ids=token_type_ids, position_ids=position_ids)
lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
loss = [lm_loss, mc_loss]
total_voc = self.n_special + self.vocab_size
self.parent.assertListEqual(
list(lm_logits.size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertListEqual(
list(mc_logits.size()),
[self.batch_size, self.n_choices])
self.parent.assertListEqual(
[list(l.size()) for l in loss],
[[], []])
def create_and_check_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.parent.assertIsNotNone(model)
def create_and_check_commons(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
inputs_dict = {'input_ids': input_ids}
create_and_check_commons(self, config, inputs_dict)
def run_common_tests(self, test_presents=False):
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_base_model(*config_and_inputs)
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_lm_head(*config_and_inputs)
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_double_heads(*config_and_inputs)
if test_presents:
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_presents(*config_and_inputs)
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_commons(*config_and_inputs)
def run_slow_tests(self):
        # create_and_check_model_from_pretrained takes no inputs, so no config/inputs are prepared here
        self.create_and_check_model_from_pretrained()
...@@ -28,9 +28,15 @@ import torch ...@@ -28,9 +28,15 @@ import torch
from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel) from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
class TransfoXLModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (TransfoXLModel, TransfoXLLMHeadModel)
test_pruning = False
test_torchscript = False
test_resize_embeddings = False
class TransfoXLModelTest(unittest.TestCase):
class TransfoXLModelTester(object): class TransfoXLModelTester(object):
def __init__(self, def __init__(self,
...@@ -52,7 +58,6 @@ class TransfoXLModelTest(unittest.TestCase): ...@@ -52,7 +58,6 @@ class TransfoXLModelTest(unittest.TestCase):
num_hidden_layers=5, num_hidden_layers=5,
scope=None, scope=None,
seed=1, seed=1,
all_model_classes=(TransfoXLModel, TransfoXLLMHeadModel),
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
...@@ -73,7 +78,6 @@ class TransfoXLModelTest(unittest.TestCase): ...@@ -73,7 +78,6 @@ class TransfoXLModelTest(unittest.TestCase):
self.num_hidden_layers = num_hidden_layers self.num_hidden_layers = num_hidden_layers
self.scope = scope self.scope = scope
self.seed = seed self.seed = seed
self.all_model_classes = all_model_classes
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
...@@ -171,16 +175,31 @@ class TransfoXLModelTest(unittest.TestCase): ...@@ -171,16 +175,31 @@ class TransfoXLModelTest(unittest.TestCase):
list(list(mem.size()) for mem in result["mems_2"]), list(list(mem.size()) for mem in result["mems_2"]),
[[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def create_and_check_transfo_xl_commons(self, config, input_ids_1, input_ids_2, lm_labels): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids_1} inputs_dict = {'input_ids': input_ids_1}
create_and_check_commons(self, config, inputs_dict, test_pruning=False, test_torchscript=False) return config, inputs_dict
def test_default(self): def setUp(self):
self.run_tester(TransfoXLModelTest.TransfoXLModelTester(self)) self.model_tester = TransfoXLModelTest.TransfoXLModelTester(self)
self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
def test_config(self): def test_config(self):
config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37) self.config_tester.run_common_tests()
config_tester.run_common_tests()
def test_transfo_xl_model(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs)
self.model_tester.check_transfo_xl_model_output(output_result)
def test_transfo_xl_lm_head(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
self.model_tester.check_transfo_xl_lm_head_output(output_result)
@pytest.mark.slow @pytest.mark.slow
def test_model_from_pretrained(self): def test_model_from_pretrained(self):
...@@ -190,23 +209,6 @@ class TransfoXLModelTest(unittest.TestCase): ...@@ -190,23 +209,6 @@ class TransfoXLModelTest(unittest.TestCase):
shutil.rmtree(cache_dir) shutil.rmtree(cache_dir)
self.assertIsNotNone(model) self.assertIsNotNone(model)
def run_tester(self, tester):
config_and_inputs = tester.prepare_config_and_inputs()
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_transfo_xl_model(*config_and_inputs)
tester.check_transfo_xl_model_output(output_result)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_transfo_xl_lm_head(*config_and_inputs)
tester.check_transfo_xl_lm_head_output(output_result)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_transfo_xl_commons(*config_and_inputs)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
# coding=utf-8
# Copyright 2018 HuggingFace Inc..
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import logging
from pytorch_transformers import PretrainedConfig, PreTrainedModel
from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
class ModelUtilsTest(unittest.TestCase):
def test_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
config = BertConfig.from_pretrained(model_name)
self.assertIsNotNone(config)
self.assertIsInstance(config, PretrainedConfig)
model = BertModel.from_pretrained(model_name)
model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
self.assertIsNotNone(model)
self.assertIsInstance(model, PreTrainedModel)
for value in loading_info.values():
self.assertEqual(len(value), 0)
config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
self.assertEqual(model.config.output_attentions, True)
self.assertEqual(model.config.output_hidden_states, True)
self.assertEqual(model.config, config)
if __name__ == "__main__":
unittest.main()
...@@ -23,10 +23,15 @@ import pytest ...@@ -23,10 +23,15 @@ import pytest
from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification) from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor) from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
class XLMModelTest(unittest.TestCase): class XLMModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (XLMModel, XLMWithLMHeadModel,
XLMForQuestionAnswering, XLMForSequenceClassification)
# , XLMForSequenceClassification, XLMForTokenClassification),
class XLMModelTester(object): class XLMModelTester(object):
def __init__(self, def __init__(self,
...@@ -58,8 +63,6 @@ class XLMModelTest(unittest.TestCase): ...@@ -58,8 +63,6 @@ class XLMModelTest(unittest.TestCase):
summary_type="last", summary_type="last",
use_proj=True, use_proj=True,
scope=None, scope=None,
all_model_classes = (XLMModel, XLMWithLMHeadModel,
XLMForQuestionAnswering, XLMForSequenceClassification), # , XLMForSequenceClassification, XLMForTokenClassification),
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
...@@ -90,7 +93,6 @@ class XLMModelTest(unittest.TestCase): ...@@ -90,7 +93,6 @@ class XLMModelTest(unittest.TestCase):
self.num_labels = num_labels self.num_labels = num_labels
self.num_choices = num_choices self.num_choices = num_choices
self.scope = scope self.scope = scope
self.all_model_classes = all_model_classes
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
...@@ -237,28 +239,23 @@ class XLMModelTest(unittest.TestCase): ...@@ -237,28 +239,23 @@ class XLMModelTest(unittest.TestCase):
[self.batch_size, self.type_sequence_label_size]) [self.batch_size, self.type_sequence_label_size])
def create_and_check_xlm_commons(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_lengths,
sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths} inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths}
create_and_check_commons(self, config, inputs_dict) return config, inputs_dict
def test_default(self): def setUp(self):
self.run_tester(XLMModelTest.XLMModelTester(self)) self.model_tester = XLMModelTest.XLMModelTester(self)
self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37)
def test_config(self): def test_config(self):
config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) self.config_tester.run_common_tests()
config_tester.run_common_tests()
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
def run_tester(self, tester): def test_xlm_model(self):
config_and_inputs = tester.prepare_config_and_inputs() config_and_inputs = self.model_tester.prepare_config_and_inputs()
tester.create_and_check_xlm_model(*config_and_inputs) self.model_tester.create_and_check_xlm_model(*config_and_inputs)
# config_and_inputs = tester.prepare_config_and_inputs() # config_and_inputs = tester.prepare_config_and_inputs()
# tester.create_and_check_xlm_for_masked_lm(*config_and_inputs) # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs)
...@@ -275,8 +272,14 @@ class XLMModelTest(unittest.TestCase): ...@@ -275,8 +272,14 @@ class XLMModelTest(unittest.TestCase):
# config_and_inputs = tester.prepare_config_and_inputs() # config_and_inputs = tester.prepare_config_and_inputs()
# tester.create_and_check_xlm_for_token_classification(*config_and_inputs) # tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
config_and_inputs = tester.prepare_config_and_inputs() @pytest.mark.slow
tester.create_and_check_xlm_commons(*config_and_inputs) def test_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -28,9 +28,14 @@ import torch ...@@ -28,9 +28,14 @@ import torch
from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering) from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
class XLNetModelTest(CommonTestCases.CommonModelTester):
all_model_classes=(XLNetModel, XLNetLMHeadModel,
XLNetForSequenceClassification, XLNetForQuestionAnswering)
test_pruning = False
class XLNetModelTest(unittest.TestCase):
class XLNetModelTester(object): class XLNetModelTester(object):
def __init__(self, def __init__(self,
...@@ -56,8 +61,6 @@ class XLNetModelTest(unittest.TestCase): ...@@ -56,8 +61,6 @@ class XLNetModelTest(unittest.TestCase):
initializer_range=0.05, initializer_range=0.05,
seed=1, seed=1,
type_vocab_size=2, type_vocab_size=2,
all_model_classes=(XLNetModel, XLNetLMHeadModel,
XLNetForSequenceClassification, XLNetForQuestionAnswering),
): ):
self.parent = parent self.parent = parent
self.batch_size = batch_size self.batch_size = batch_size
...@@ -82,7 +85,6 @@ class XLNetModelTest(unittest.TestCase): ...@@ -82,7 +85,6 @@ class XLNetModelTest(unittest.TestCase):
self.seed = seed self.seed = seed
self.type_vocab_size = type_vocab_size self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size self.type_sequence_label_size = type_sequence_label_size
self.all_model_classes = all_model_classes
def prepare_config_and_inputs(self): def prepare_config_and_inputs(self):
input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
...@@ -264,17 +266,41 @@ class XLNetModelTest(unittest.TestCase): ...@@ -264,17 +266,41 @@ class XLNetModelTest(unittest.TestCase):
list(list(mem.size()) for mem in result["mems_1"]), list(list(mem.size()) for mem in result["mems_1"]),
[[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers)
def create_and_check_xlnet_commons(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, def prepare_config_and_inputs_for_common(self):
target_mapping, inp_q, segment_ids, lm_labels, sequence_labels, is_impossible_labels): config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask,
target_mapping, inp_q, segment_ids, lm_labels,
sequence_labels, is_impossible_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids_1} inputs_dict = {'input_ids': input_ids_1}
create_and_check_commons(self, config, inputs_dict, test_pruning=False) return config, inputs_dict
def test_default(self): def setUp(self):
self.run_tester(XLNetModelTest.XLNetModelTester(self)) self.model_tester = XLNetModelTest.XLNetModelTester(self)
self.config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37)
def test_config(self): def test_config(self):
config_tester = ConfigTester(self, config_class=XLNetConfig, d_inner=37) self.config_tester.run_common_tests()
config_tester.run_common_tests()
def test_xlnet_base_model(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_base_model(*config_and_inputs)
def test_xlnet_lm_head(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_lm_head(*config_and_inputs)
def test_xlnet_sequence_classif(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
def test_xlnet_qa(self):
self.model_tester.set_seed()
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_xlnet_qa(*config_and_inputs)
@pytest.mark.slow @pytest.mark.slow
def test_model_from_pretrained(self): def test_model_from_pretrained(self):
...@@ -284,27 +310,6 @@ class XLNetModelTest(unittest.TestCase): ...@@ -284,27 +310,6 @@ class XLNetModelTest(unittest.TestCase):
shutil.rmtree(cache_dir) shutil.rmtree(cache_dir)
self.assertIsNotNone(model) self.assertIsNotNone(model)
def run_tester(self, tester):
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_base_model(*config_and_inputs)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_lm_head(*config_and_inputs)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_sequence_classif(*config_and_inputs)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_qa(*config_and_inputs)
tester.set_seed()
config_and_inputs = tester.prepare_config_and_inputs()
tester.create_and_check_xlnet_commons(*config_and_inputs)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()