Commit 40ed7172 authored by erenup

Merge remote-tracking branch 'refs/remotes/huggingface/master'

parents 86a63070 7296f101
......@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
from torch.nn import functional as F
from .configuration_utils import PretrainedConfig
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
cached_path, hf_bucket_url, is_remote_url)
logger = logging.getLogger(__name__)
......@@ -53,7 +54,7 @@ class PreTrainedModel(nn.Module):
r""" Base class for all models.
:class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
......@@ -83,55 +84,59 @@ class PreTrainedModel(nn.Module):
# Save config in model
self.config = config
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
""" Build a resized Embedding Module from a provided token Embedding Module.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
@property
def base_model(self):
return getattr(self, self.base_model_prefix, self)
Args:
new_num_tokens: (`optional`) int
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: return the provided token Embedding Module.
Return: ``torch.nn.Embeddings``
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
def get_input_embeddings(self):
""" Get model's input embeddings
"""
if new_num_tokens is None:
return old_embeddings
old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
if old_num_tokens == new_num_tokens:
return old_embeddings
# Build new embeddings
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
new_embeddings.to(old_embeddings.weight.device)
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
return base_model.get_input_embeddings()
else:
raise NotImplementedError
# initialize all new embeddings (in particular added tokens)
self._init_weights(new_embeddings)
def set_input_embeddings(self, value):
""" Set model's input embeddings
"""
base_model = getattr(self, self.base_model_prefix, self)
if base_model is not self:
base_model.set_input_embeddings(value)
else:
raise NotImplementedError
# Copy word embeddings from the previous weights
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
def get_output_embeddings(self):
""" Get model's output embeddings
Return None if the model doesn't have output embeddings
"""
return None # Overwrite for models with output embeddings
return new_embeddings
def tie_weights(self):
""" Make sure we are sharing the input and output embeddings.
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
"""
output_embeddings = self.get_output_embeddings()
if output_embeddings is not None:
self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
def _tie_or_clone_weights(self, first_module, second_module):
def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
""" Tie or clone module weights depending of weither we are using TorchScript or not
"""
if self.config.torchscript:
first_module.weight = nn.Parameter(second_module.weight.clone())
output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
else:
first_module.weight = second_module.weight
output_embeddings.weight = input_embeddings.weight
if hasattr(first_module, 'bias') and first_module.bias is not None:
first_module.bias.data = torch.nn.functional.pad(
first_module.bias.data,
(0, first_module.weight.shape[0] - first_module.bias.shape[0]),
if hasattr(output_embeddings, 'bias') and output_embeddings.bias is not None:
output_embeddings.bias.data = torch.nn.functional.pad(
output_embeddings.bias.data,
(0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0]),
'constant',
0
)
if hasattr(output_embeddings, 'out_features') and hasattr(input_embeddings, 'num_embeddings'):
output_embeddings.out_features = input_embeddings.num_embeddings
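# For readers skimming the diff: a minimal standalone sketch of what tying versus cloning
# amounts to. The toy embedding and projection below are illustrative, not part of the library.
import torch.nn as nn

embedding = nn.Embedding(10, 4)            # input embeddings: vocab 10, dim 4
decoder = nn.Linear(4, 10, bias=False)     # output projection with a matching weight shape

torchscript = False                        # stands in for config.torchscript
if torchscript:
    decoder.weight = nn.Parameter(embedding.weight.clone())   # independent copy, safe to export
else:
    decoder.weight = embedding.weight                         # shared Parameter, updates stay in sync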
def resize_token_embeddings(self, new_num_tokens=None):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
......@@ -161,6 +166,46 @@ class PreTrainedModel(nn.Module):
return model_embeds
def _resize_token_embeddings(self, new_num_tokens):
old_embeddings = self.get_input_embeddings()
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
self.set_input_embeddings(new_embeddings)
return self.get_input_embeddings()
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
""" Build a resized Embedding Module from a provided token Embedding Module.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
Args:
new_num_tokens: (`optional`) int
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: return the provided token Embedding Module.
Return: ``torch.nn.Embeddings``
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
"""
if new_num_tokens is None:
return old_embeddings
old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
if old_num_tokens == new_num_tokens:
return old_embeddings
# Build new embeddings
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
new_embeddings.to(old_embeddings.weight.device)
# initialize all new embeddings (in particular added tokens)
self._init_weights(new_embeddings)
# Copy word embeddings from the previous weights
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
return new_embeddings
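# A usage sketch of the resizing path above, assuming standard pretrained shortcuts;
# the added tokens are placeholders.
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer.add_tokens(['new_tok1', 'new_tok2'])       # grow the vocabulary
model.resize_token_embeddings(len(tokenizer))        # copies old rows, randomly initializes the new ones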
def init_weights(self):
""" Initialize and prunes weights if needed. """
# Initialize weights
......@@ -170,6 +215,9 @@ class PreTrainedModel(nn.Module):
if self.config.pruned_heads:
self.prune_heads(self.config.pruned_heads)
# Tie weights if needed
self.tie_weights()
def prune_heads(self, heads_to_prune):
""" Prunes heads of the base model.
......@@ -178,14 +226,12 @@ class PreTrainedModel(nn.Module):
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
"""
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
# save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
for layer, heads in heads_to_prune.items():
union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
base_model._prune_heads(heads_to_prune)
self.base_model._prune_heads(heads_to_prune)
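# A short usage sketch of head pruning, mirroring the dict format described in the docstring above.
from transformers import BertModel

model = BertModel.from_pretrained('bert-base-uncased')
model.prune_heads({1: [0, 2], 2: [2, 3]})   # prune heads 0 and 2 in layer 1, heads 2 and 3 in layer 2
# The union with previously pruned heads is recorded in model.config.pruned_heads.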
def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it
......@@ -193,7 +239,7 @@ class PreTrainedModel(nn.Module):
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
# Only save the model it-self if we are using distributed training
# Only save the model itself if we are using distributed training
model_to_save = self.module if hasattr(self, 'module') else self
# Save configuration file
......@@ -220,6 +266,7 @@ class PreTrainedModel(nn.Module):
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
......@@ -246,6 +293,9 @@ class PreTrainedModel(nn.Module):
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
resume_download: (`optional`) boolean, default False:
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
......@@ -270,11 +320,17 @@ class PreTrainedModel(nn.Module):
model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
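# An additional sketch of the new download options; the proxy address below is a placeholder.
model = BertModel.from_pretrained('bert-base-uncased', resume_download=True,
                                  proxies={'https': 'proxy.example.com:3128'})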
"""
if pretrained_model_name_or_path is not None and (
"albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path):
logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
"https://github.com/google-research/google-research/issues/119 for more information.")
config = kwargs.pop('config', None)
state_dict = kwargs.pop('state_dict', None)
cache_dir = kwargs.pop('cache_dir', None)
from_tf = kwargs.pop('from_tf', False)
force_download = kwargs.pop('force_download', False)
resume_download = kwargs.pop('resume_download', False)
proxies = kwargs.pop('proxies', None)
output_loading_info = kwargs.pop('output_loading_info', False)
......@@ -284,6 +340,8 @@ class PreTrainedModel(nn.Module):
pretrained_model_name_or_path, *model_args,
cache_dir=cache_dir, return_unused_kwargs=True,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
**kwargs
)
else:
......@@ -307,15 +365,21 @@ class PreTrainedModel(nn.Module):
raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
[WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
pretrained_model_name_or_path))
elif os.path.isfile(pretrained_model_name_or_path):
elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
archive_file = pretrained_model_name_or_path
else:
assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
elif os.path.isfile(pretrained_model_name_or_path + ".index"):
assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
pretrained_model_name_or_path + ".index")
archive_file = pretrained_model_name_or_path + ".index"
else:
archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
if from_tf:
raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
proxies=proxies, resume_download=resume_download)
except EnvironmentError:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
......@@ -371,6 +435,8 @@ class PreTrainedModel(nn.Module):
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if key == 'lm_head.decoder.weight':
new_key = 'lm_head.weight'
if new_key:
old_keys.append(key)
new_keys.append(new_key)
......@@ -383,6 +449,8 @@ class PreTrainedModel(nn.Module):
if metadata is not None:
state_dict._metadata = metadata
# PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
# so we need to apply the function recursively.
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
......@@ -680,7 +748,7 @@ class SequenceSummary(nn.Module):
def __init__(self, config):
super(SequenceSummary, self).__init__()
self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
if self.summary_type == 'attn':
# We should use a standard multi-head attention module with absolute positional embedding for that.
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
......
......@@ -73,15 +73,15 @@ def get_masks(slen, lengths, causal, padding_mask=None):
"""
Generate hidden states mask, and optionally an attention mask.
"""
bs = lengths.size(0)
alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
if padding_mask is not None:
mask = padding_mask
else:
assert lengths.max().item() <= slen
alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
mask = alen < lengths[:, None]
# attention mask is the same as mask, or triangular inferior attention (causal)
bs = lengths.size(0)
if causal:
attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
else:
......@@ -311,6 +311,10 @@ XLM_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
......@@ -407,10 +411,12 @@ class XLMModel(XLMPreTrainedModel):
self.init_weights()
def _resize_token_embeddings(self, new_num_tokens):
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
def get_input_embeddings(self):
return self.embeddings
def set_input_embeddings(self, new_embeddings):
self.embeddings = new_embeddings
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
......@@ -419,14 +425,21 @@ class XLMModel(XLMPreTrainedModel):
for layer, heads in heads_to_prune.items():
self.attentions[layer].prune_heads(heads)
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None): # removed: src_enc=None, src_len=None
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None): # removed: src_enc=None, src_len=None
if input_ids is not None:
bs, slen = input_ids.size()
else:
bs, slen = inputs_embeds.size()[:-1]
if lengths is None:
lengths = (input_ids != self.pad_index).sum(dim=1).long()
if input_ids is not None:
lengths = (input_ids != self.pad_index).sum(dim=1).long()
else:
lengths = torch.LongTensor([slen]*bs)
# mask = input_ids != self.pad_index
# check inputs
bs, slen = input_ids.size()
assert lengths.size(0) == bs
assert lengths.max().item() <= slen
# input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
......@@ -440,10 +453,12 @@ class XLMModel(XLMPreTrainedModel):
# if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
device = input_ids.device if input_ids is not None else inputs_embeds.device
# position_ids
if position_ids is None:
position_ids = input_ids.new((slen,)).long()
position_ids = torch.arange(slen, out=position_ids).unsqueeze(0)
position_ids = torch.arange(slen, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0).expand((bs, slen))
else:
assert position_ids.size() == (bs, slen) # (slen, bs)
# position_ids = position_ids.transpose(0, 1)
......@@ -469,7 +484,7 @@ class XLMModel(XLMPreTrainedModel):
head_mask = [None] * self.n_layers
# do not recompute cached elements
if cache is not None:
if cache is not None and input_ids is not None:
_slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:]
position_ids = position_ids[:, -_slen:]
......@@ -479,8 +494,10 @@ class XLMModel(XLMPreTrainedModel):
attn_mask = attn_mask[:, -_slen:]
# embeddings
tensor = self.embeddings(input_ids)
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
if inputs_embeds is None:
inputs_embeds = self.embeddings(input_ids)
tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None:
......@@ -618,15 +635,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
self.pred_layer = XLMPredLayer(config)
self.init_weights()
self.tie_weights()
def tie_weights(self):
""" Make sure we are sharing the embeddings
"""
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
def get_output_embeddings(self):
return self.pred_layer.proj
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
langs=langs,
......@@ -634,7 +648,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
outputs = self.pred_layer(output, labels)
......@@ -686,8 +701,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
langs=langs,
......@@ -695,7 +710,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
......@@ -769,8 +785,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None):
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
langs=langs,
......@@ -778,7 +794,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = transformer_outputs[0]
......@@ -864,8 +881,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None,
is_impossible=None, cls_index=None, p_mask=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
......@@ -874,7 +891,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
......
......@@ -558,6 +558,10 @@ XLNET_INPUTS_DOCSTRING = r"""
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
......@@ -579,6 +583,7 @@ class XLNetModel(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -611,10 +616,12 @@ class XLNetModel(XLNetPreTrainedModel):
self.init_weights()
def _resize_token_embeddings(self, new_num_tokens):
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
def get_input_embeddings(self):
return self.word_embedding
def set_input_embeddings(self, new_embeddings):
self.word_embedding = new_embeddings
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
......@@ -710,19 +717,29 @@ class XLNetModel(XLNetPreTrainedModel):
pos_emb = pos_emb.to(next(self.parameters()))
return pos_emb
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None):
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None):
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
# so here we move the first dimension (batch) to the end
input_ids = input_ids.transpose(0, 1).contiguous()
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input_ids = input_ids.transpose(0, 1).contiguous()
qlen, bsz = input_ids.shape[0], input_ids.shape[1]
elif inputs_embeds is not None:
inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
qlen, bsz = input_ids.shape[0], input_ids.shape[1]
mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
klen = mlen + qlen
......@@ -775,7 +792,10 @@ class XLNetModel(XLNetPreTrainedModel):
non_tgt_mask = None
##### Word embeddings and prepare h & g hidden states
word_emb_k = self.word_embedding(input_ids)
if inputs_embeds is not None:
word_emb_k = inputs_embeds
else:
word_emb_k = self.word_embedding(input_ids)
output_h = self.dropout(word_emb_k)
if target_mapping is not None:
word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
......@@ -859,7 +879,11 @@ class XLNetModel(XLNetPreTrainedModel):
hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
outputs = outputs + (hidden_states,)
if self.output_attentions:
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
if target_mapping is not None:
# when target_mapping is provided, the per-layer attentions are 2-tuples (one tensor per stream)
attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions)
else:
attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
outputs = outputs + (attentions,)
return outputs # outputs, (new_mems), (hidden_states), (attentions)
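# A hedged sketch of the shape change above, using a tiny randomly initialized config;
# all sizes and masks are arbitrary.
import torch
from transformers import XLNetConfig, XLNetModel

config = XLNetConfig(n_layer=2, d_model=32, n_head=4, d_inner=64, output_attentions=True)
model = XLNetModel(config)

input_ids = torch.randint(0, config.n_token, (1, 5))
perm_mask = torch.zeros(1, 5, 5)            # every token may attend to every other token
target_mapping = torch.zeros(1, 1, 5)
target_mapping[0, 0, -1] = 1.0              # predict the last token only

attentions = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)[-1]
# with target_mapping set, each per-layer entry is a 2-tuple (one tensor per attention stream)
assert isinstance(attentions[0], tuple) and len(attentions[0]) == 2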
......@@ -894,6 +918,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -918,15 +943,12 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
self.init_weights()
self.tie_weights()
def tie_weights(self):
""" Make sure we are sharing the embeddings
"""
self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
def get_output_embeddings(self):
return self.lm_loss
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
......@@ -934,7 +956,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
logits = self.lm_loss(transformer_outputs[0])
......@@ -978,6 +1001,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -999,8 +1023,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, labels=None):
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
......@@ -1008,7 +1032,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
output = self.sequence_summary(output)
......@@ -1028,6 +1053,106 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
XLNET_START_DOCSTRING,
XLNET_INPUTS_DOCSTRING)
class XLNetForTokenClassification(XLNetPreTrainedModel):
r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the token classification loss.
Indices should be in ``[0, ..., config.num_labels - 1]``.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
Classification scores (before SoftMax).
**mems**: (`optional`, returned when ``config.mem_len > 0``)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForTokenClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
scores = outputs[0]
"""
def __init__(self, config):
super(XLNetForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
self.transformer = XLNetModel(config)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[1:] # Keep mems, hidden states, attentions if they are in it
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
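# The masked loss above can be checked in isolation with a small self-contained sketch;
# shapes and values are arbitrary.
import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 4, num_labels)             # (batch, seq_len, num_labels)
labels = torch.randint(0, num_labels, (2, 4))      # (batch, seq_len)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])      # 0 marks padding

active = attention_mask.view(-1) == 1              # keep only non-padded positions
loss = CrossEntropyLoss()(logits.view(-1, num_labels)[active],
                          labels.view(-1)[active])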
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
......@@ -1050,6 +1175,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
......@@ -1073,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -1094,9 +1224,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights()
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
def forward(self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None,
labels=None, head_mask=None):
labels=None, head_mask=None, inputs_embeds=None):
num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
......@@ -1107,7 +1237,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids,
input_mask=flat_input_mask, attention_mask=flat_attention_mask,
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
head_mask=head_mask)
head_mask=head_mask, inputs_embeds=inputs_embeds)
output = transformer_outputs[0]
......@@ -1158,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -1179,8 +1310,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None,
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None):
outputs = self.transformer(input_ids,
......@@ -1190,7 +1321,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
......@@ -1271,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attention outputs are a list of 2-tuples of ``torch.FloatTensor``.
Examples::
......@@ -1295,8 +1428,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None,
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
......@@ -1305,7 +1438,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
head_mask=head_mask,
inputs_embeds=inputs_embeds)
hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
......
......@@ -23,86 +23,66 @@ from torch.optim.lr_scheduler import LambdaLR
logger = logging.getLogger(__name__)
class ConstantLRSchedule(LambdaLR):
""" Constant learning rate schedule.
def get_constant_schedule(optimizer, last_epoch=-1):
""" Create a schedule with a constant learning rate.
"""
def __init__(self, optimizer, last_epoch=-1):
super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
class WarmupConstantSchedule(LambdaLR):
""" Linear warmup and then constant.
Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps.
Keeps learning rate schedule equal to 1. after warmup_steps.
def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
""" Create a schedule with a constant learning rate preceded by a warmup
period during which the learning rate increases linearly between 0 and 1.
"""
def __init__(self, optimizer, warmup_steps, last_epoch=-1):
self.warmup_steps = warmup_steps
super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
def lr_lambda(self, step):
if step < self.warmup_steps:
return float(step) / float(max(1.0, self.warmup_steps))
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1.0, num_warmup_steps))
return 1.
return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
class WarmupLinearSchedule(LambdaLR):
""" Linear warmup and then linear decay.
Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps.
"""
def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
self.warmup_steps = warmup_steps
self.t_total = t_total
super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
def lr_lambda(self, step):
if step < self.warmup_steps:
return float(step) / float(max(1, self.warmup_steps))
return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
class WarmupCosineSchedule(LambdaLR):
""" Linear warmup and then cosine decay.
Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
""" Create a schedule with a learning rate that decreases linearly after
linearly increasing during a warmup period.
"""
def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
self.warmup_steps = warmup_steps
self.t_total = t_total
self.cycles = cycles
super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
def lr_lambda(self, step):
if step < self.warmup_steps:
return float(step) / float(max(1.0, self.warmup_steps))
# progress after warmup
progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
class WarmupCosineWithHardRestartsSchedule(LambdaLR):
""" Linear warmup and then cosine cycles with hard restarts.
Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
learning rate (with hard restarts).
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
return LambdaLR(optimizer, lr_lambda, last_epoch)
def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
""" Create a schedule with a learning rate that decreases following the
values of the cosine function between 0 and `pi * cycles` after a warmup
period during which it increases linearly between 0 and 1.
"""
def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
self.warmup_steps = warmup_steps
self.t_total = t_total
self.cycles = cycles
super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
def lr_lambda(self, step):
if step < self.warmup_steps:
return float(step) / float(max(1, self.warmup_steps))
# progress after warmup
progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
if progress >= 1.0:
return 0.0
return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0))))
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
return LambdaLR(optimizer, lr_lambda, last_epoch)
def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1):
""" Create a schedule with a learning rate that decreases following the
values of the cosine function with several hard restarts, after a warmup
period during which it increases linearly between 0 and 1.
"""
def lr_lambda(current_step):
if current_step < num_warmup_steps:
return float(current_step) / float(max(1, num_warmup_steps))
progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
if progress >= 1.:
return 0.
return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.))))
return LambdaLR(optimizer, lr_lambda, last_epoch)
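# A usage sketch of the function-based schedules, assuming they are re-exported at the package
# level; the stand-in model and step counts are placeholders.
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)                        # stand-in for a real model
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,
                                            num_training_steps=1000)

for step in range(1000):
    # forward pass and loss.backward() would go here
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()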
class AdamW(Optimizer):
""" Implements Adam algorithm with weight decay fix.
......
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Applys a warmup schedule on a given learning rate decay schedule."""
def __init__(
self,
initial_learning_rate,
decay_schedule_fn,
warmup_steps,
power=1.0,
name=None):
super(WarmUp, self).__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.power = power
self.decay_schedule_fn = decay_schedule_fn
self.name = name
def __call__(self, step):
with tf.name_scope(self.name or 'WarmUp') as name:
# Implements polynomial warmup. i.e., if global_step < warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
global_step_float = tf.cast(step, tf.float32)
warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
warmup_percent_done = global_step_float / warmup_steps_float
warmup_learning_rate = (
self.initial_learning_rate *
tf.math.pow(warmup_percent_done, self.power))
return tf.cond(global_step_float < warmup_steps_float,
lambda: warmup_learning_rate,
lambda: self.decay_schedule_fn(step),
name=name)
def get_config(self):
return {
'initial_learning_rate': self.initial_learning_rate,
'decay_schedule_fn': self.decay_schedule_fn,
'warmup_steps': self.warmup_steps,
'power': self.power,
'name': self.name
}
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
"""Creates an optimizer with learning rate schedule."""
# Implements linear decay of the learning rate.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr,
decay_steps=num_train_steps,
end_learning_rate=0.0)
if num_warmup_steps:
learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
decay_schedule_fn=learning_rate_fn,
warmup_steps=num_warmup_steps)
optimizer = AdamWeightDecay(
learning_rate=learning_rate_fn,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=['layer_norm', 'bias'])
return optimizer
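# A sketch of a manual TF2 training step built on create_optimizer. The model, data and
# clip_norm value are placeholders; clip_norm is required by the apply_gradients override below.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])        # stand-in model
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = create_optimizer(init_lr=3e-5, num_train_steps=1000, num_warmup_steps=100)

x = tf.random.normal((8, 4))
y = tf.zeros((8,), dtype=tf.int32)

with tf.GradientTape() as tape:
    loss = loss_fn(y, model(x))
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables), clip_norm=1.0)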
class AdamWeightDecay(tf.keras.optimizers.Adam):
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
Just adding the square of the weights to the loss function is *not* the
correct way of using L2 regularization/weight decay with Adam, since that will
interact with the m and v parameters in strange ways.
Instead we want to decay the weights in a manner that doesn't interact with
the m/v parameters. This is equivalent to adding the square of the weights to
the loss with plain (non-momentum) SGD.
"""
def __init__(self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
weight_decay_rate=0.0,
include_in_weight_decay=None,
exclude_from_weight_decay=None,
name='AdamWeightDecay',
**kwargs):
super(AdamWeightDecay, self).__init__(
learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
@classmethod
def from_config(cls, config):
"""Creates an optimizer from its config with WarmUp custom object."""
custom_objects = {'WarmUp': WarmUp}
return super(AdamWeightDecay, cls).from_config(
config, custom_objects=custom_objects)
def _prepare_local(self, var_device, var_dtype, apply_state):
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
apply_state)
apply_state['weight_decay_rate'] = tf.constant(
self.weight_decay_rate, name='adam_weight_decay_rate')
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var *
apply_state['weight_decay_rate'],
use_locking=self._use_locking)
return tf.no_op()
def apply_gradients(self, grads_and_vars, clip_norm, name=None):
grads, tvars = list(zip(*grads_and_vars))
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
def _get_lr(self, var_device, var_dtype, apply_state):
"""Retrieves the learning rate with the given state."""
if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients['lr_t'], dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_dense(
grad, var, **kwargs)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_sparse(
grad, var, indices, **kwargs)
def get_config(self):
config = super(AdamWeightDecay, self).get_config()
config.update({
'weight_decay_rate': self.weight_decay_rate,
})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object):
"""Distribution strategies-aware gradient accumulation utility."""
def __init__(self):
"""Initializes the accumulator."""
self._gradients = []
self._accum_steps = tf.Variable(
initial_value=0,
dtype=tf.int64,
trainable=False,
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
@property
def step(self):
"""Number of accumulated steps."""
return self._accum_steps.value()
@property
def gradients(self):
"""The accumulated gradients."""
return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients())
def __call__(self, gradients):
"""Accumulates :obj:`gradients`."""
if not self._gradients:
self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients])
if len(gradients) != len(self._gradients):
raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
if accum_gradient is not None:
accum_gradient.assign_add(gradient)
self._accum_steps.assign_add(1)
def reset(self):
"""Resets the accumulated gradients."""
if self._gradients:
self._accum_steps.assign(0)
for gradient in self._get_replica_gradients():
if gradient is not None:
gradient.assign(tf.zeros_like(gradient))
def _get_replica_gradients(self):
if tf.distribute.has_strategy():
# In a replica context, we want to accumulate gradients on each replica
# without synchronization, so we directly assign the value of the
# current replica.
replica_context = tf.distribute.get_replica_context()
if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
return self._gradients
return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients)
else:
return self._gradients
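# Outside of a distribution strategy the accumulator can be exercised directly; a toy sketch.
import tensorflow as tf

accumulator = GradientAccumulator()
var = tf.Variable([1.0, 2.0])

for _ in range(4):                          # four micro-batches
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(var * var)
    grads = tape.gradient(loss, [var])
    accumulator(grads)                      # adds into the internal buffers

print(accumulator.step)                     # 4 accumulated steps
summed = accumulator.gradients              # list of summed gradients
accumulator.reset()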
# content of conftest.py
import pytest
def pytest_addoption(parser):
parser.addoption(
"--runslow", action="store_true", default=False, help="run slow tests"
)
def pytest_collection_modifyitems(config, items):
if config.getoption("--runslow"):
# --runslow given in cli: do not skip slow tests
return
skip_slow = pytest.mark.skip(reason="need --runslow option to run")
for item in items:
if "slow" in item.keywords:
item.add_marker(skip_slow)
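# A test file can then opt individual tests into the slow path; test and file names are illustrative.
# content of test_slow_example.py (hypothetical)
import pytest

@pytest.mark.slow
def test_full_model_download():
    assert True     # skipped unless pytest is invoked with --runslow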
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import os
import six
import time
import unittest
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
USER = "__DUMMY_TRANSFORMERS_USER__"
PASS = "__DUMMY_TRANSFORMERS_PASS__"
FILE_KEY = "Test-{}.txt".format(int(time.time()))
FILE_PATH = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
)
class HfApiCommonTest(unittest.TestCase):
_api = HfApi(endpoint="https://moon-staging.huggingface.co")
class HfApiLoginTest(HfApiCommonTest):
def test_login_invalid(self):
with self.assertRaises(HTTPError):
self._api.login(username=USER, password="fake")
def test_login_valid(self):
token = self._api.login(username=USER, password=PASS)
self.assertIsInstance(token, six.string_types)
class HfApiEndpointsTest(HfApiCommonTest):
@classmethod
def setUpClass(cls):
"""
Share this valid token in all tests below.
"""
cls._token = cls._api.login(username=USER, password=PASS)
def test_whoami(self):
user = self._api.whoami(token=self._token)
self.assertEqual(user, USER)
def test_presign(self):
urls = self._api.presign(token=self._token, filename=FILE_KEY)
self.assertIsInstance(urls, PresignedUrl)
self.assertEqual(urls.type, "text/plain")
def test_presign_and_upload(self):
access_url = self._api.presign_and_upload(
token=self._token, filename=FILE_KEY, filepath=FILE_PATH
)
self.assertIsInstance(access_url, six.string_types)
def test_list_objs(self):
objs = self._api.list_objs(token=self._token)
self.assertIsInstance(objs, list)
if len(objs) > 0:
o = objs[-1]
self.assertIsInstance(o, S3Obj)
class HfFolderTest(unittest.TestCase):
def test_token_workflow(self):
"""
Test the whole token save/get/delete workflow,
including the expected behavior when no token has been saved.
"""
token = "token-{}".format(int(time.time()))
HfFolder.save_token(token)
self.assertEqual(
HfFolder.get_token(),
token
)
HfFolder.delete_token()
HfFolder.delete_token()
# ^^ not an error, we test that the
# second call does not fail.
self.assertEqual(
HfFolder.get_token(),
None
)
if __name__ == "__main__":
unittest.main()
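# --- Illustrative usage sketch (not part of the diff) -----------------------
# End-to-end sequence of the HfApi / HfFolder calls exercised by the tests
# above, with placeholder credentials and file paths. Only methods that appear
# in the tests are used here.
from transformers.hf_api import HfApi, HfFolder

api = HfApi(endpoint="https://moon-staging.huggingface.co")
token = api.login(username="my-user", password="my-pass")    # returns a token string
HfFolder.save_token(token)                                    # cache it locally

access_url = api.presign_and_upload(
    token=HfFolder.get_token(),
    filename="example.txt",
    filepath="/path/to/example.txt",
)
objs = api.list_objs(token=token)                             # list of S3Obj entries

HfFolder.delete_token()                                       # safe to call twice (see test above)
# ---------------------------------------------------------------------------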
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
AlbertForSequenceClassification, AlbertForQuestionAnswering,
)
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
class AlbertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
class AlbertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
embedding_size=16,
hidden_size=36,
num_hidden_layers=6,
num_hidden_groups=6,
num_attention_heads=6,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
self.num_hidden_groups = num_hidden_groups
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = AlbertConfig(
vocab_size_or_config_json_file=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range,
num_hidden_groups=self.num_hidden_groups)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
[])
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertModel(config=config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids)
result = {
"sequence_output": sequence_output,
"pooled_output": pooled_output,
}
self.parent.assertListEqual(
list(result["sequence_output"].size()),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
result = {
"loss": loss,
"prediction_scores": prediction_scores,
}
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = AlbertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
start_positions=sequence_labels, end_positions=sequence_labels)
result = {
"loss": loss,
"start_logits": start_logits,
"end_logits": end_logits,
}
self.parent.assertListEqual(
list(result["start_logits"].size()),
[self.batch_size, self.seq_length])
self.parent.assertListEqual(
list(result["end_logits"].size()),
[self.batch_size, self.seq_length])
self.check_loss_output(result)
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = AlbertForSequenceClassification(config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
result = {
"loss": loss,
"logits": logits,
}
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.num_labels])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = AlbertModelTest.AlbertModelTester(self)
self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_albert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)
def test_for_question_answering(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
......@@ -18,11 +18,12 @@ from __future__ import print_function
import unittest
import shutil
import pytest
import logging
from transformers import is_torch_available
from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
if is_torch_available():
from transformers import (AutoConfig, BertConfig,
AutoModel, BertModel,
......@@ -33,11 +34,11 @@ if is_torch_available():
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
else:
pytestmark = pytest.mark.skip("Require Torch")
@require_torch
class AutoModelTest(unittest.TestCase):
@slow
def test_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -52,6 +53,7 @@ class AutoModelTest(unittest.TestCase):
for value in loading_info.values():
self.assertEqual(len(value), 0)
@slow
def test_lmhead_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -64,6 +66,7 @@ class AutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForMaskedLM)
@slow
def test_sequence_classification_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -76,6 +79,7 @@ class AutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForSequenceClassification)
@slow
def test_question_answering_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -88,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, BertForQuestionAnswering)
def test_from_pretrained_identifier(self):
logging.basicConfig(level=logging.INFO)
model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
self.assertIsInstance(model, BertForMaskedLM)
if __name__ == "__main__":
unittest.main()
......@@ -18,28 +18,27 @@ from __future__ import print_function
import unittest
import shutil
import pytest
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
from transformers import (BertConfig, BertModel, BertForMaskedLM,
BertForNextSentencePrediction, BertForPreTraining,
BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification, BertForMultipleChoice)
BertForNextSentencePrediction, BertForPreTraining,
BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification, BertForMultipleChoice)
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
else:
pytestmark = pytest.mark.skip("Require Torch")
@require_torch
class BertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification) if is_torch_available() else ()
BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification) if is_torch_available() else ()
class BertModelTester(object):
......@@ -66,7 +65,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
num_labels=3,
num_choices=4,
scope=None,
):
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
......@@ -120,10 +119,20 @@ class BertModelTest(CommonTestCases.CommonModelTester):
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
is_decoder=False,
initializer_range=self.initializer_range)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def prepare_config_and_inputs_for_decoder(self):
config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = self.prepare_config_and_inputs()
config.is_decoder = True
encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask
def check_loss_output(self, result):
self.parent.assertListEqual(
list(result["loss"].size()),
......@@ -131,6 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertModel(config=config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
......@@ -145,9 +155,26 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
model = BertModel(config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
result = {
"sequence_output": sequence_output,
"pooled_output": pooled_output,
}
self.parent.assertListEqual(
list(result["sequence_output"].size()),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
result = {
......@@ -159,8 +186,24 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
model = BertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
result = {
"loss": loss,
"prediction_scores": prediction_scores,
}
self.parent.assertListEqual(
list(result["prediction_scores"].size()),
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForNextSentencePrediction(config=config)
model.to(torch_device)
model.eval()
loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
result = {
......@@ -172,9 +215,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, 2])
self.check_loss_output(result)
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForPreTraining(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
......@@ -191,9 +234,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, 2])
self.check_loss_output(result)
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = BertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
start_positions=sequence_labels, end_positions=sequence_labels)
......@@ -210,10 +253,10 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length])
self.check_loss_output(result)
def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = BertForSequenceClassification(config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
result = {
......@@ -225,10 +268,10 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.num_labels])
self.check_loss_output(result)
def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = BertForTokenClassification(config=config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
result = {
......@@ -240,10 +283,10 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_choices = self.num_choices
model = BertForMultipleChoice(config=config)
model.to(torch_device)
model.eval()
multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
......@@ -261,7 +304,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.num_choices])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
......@@ -280,10 +322,18 @@ class BertModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_model(*config_and_inputs)
def test_bert_model_as_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_bert_model_as_decoder(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
def test_for_masked_lm_decoder(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
self.model_tester.create_and_check_bert_model_for_masked_lm_as_decoder(*config_and_inputs)
def test_for_multiple_choice(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)
......@@ -308,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -316,5 +366,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
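# --- Illustrative usage sketch (not part of the diff) -----------------------
# The new decoder path tested above, in miniature: a BertConfig with
# is_decoder=True lets BertModel cross-attend over externally supplied encoder
# states. The tiny config values and tensor shapes are illustrative
# assumptions, not taken from the test file.
import torch
from transformers import BertConfig, BertModel

config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=36,
                    num_hidden_layers=2, num_attention_heads=6,
                    intermediate_size=37, is_decoder=True)
decoder = BertModel(config)
decoder.eval()

input_ids = torch.randint(0, 99, (2, 7))                      # (batch, seq_len)
encoder_hidden_states = torch.rand(2, 7, 36)                  # from some encoder
encoder_attention_mask = torch.ones(2, 7, dtype=torch.long)

sequence_output, pooled_output = decoder(
    input_ids,
    encoder_hidden_states=encoder_hidden_states,
    encoder_attention_mask=encoder_attention_mask,
)
# ---------------------------------------------------------------------------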
......@@ -27,19 +27,18 @@ import uuid
import unittest
import logging
import pytest
from transformers import is_torch_available
from .utils import require_torch, slow, torch_device
if is_torch_available():
import torch
import numpy as np
from transformers import (PretrainedConfig, PreTrainedModel,
from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
pytestmark = pytest.mark.skip("Require Torch")
if sys.version_info[0] == 2:
import cPickle as pickle
......@@ -65,6 +64,7 @@ def _config_zero_init(config):
class CommonTestCases:
@require_torch
class CommonModelTester(unittest.TestCase):
model_tester = None
......@@ -79,6 +79,7 @@ class CommonTestCases:
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**inputs_dict)
......@@ -86,12 +87,13 @@ class CommonTestCases:
with TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname)
model.to(torch_device)
with torch.no_grad():
after_outputs = model(**inputs_dict)
# Make sure we don't have nans
out_1 = after_outputs[0].numpy()
out_2 = outputs[0].numpy()
out_1 = after_outputs[0].cpu().numpy()
out_2 = outputs[0].cpu().numpy()
out_1 = out_1[~np.isnan(out_1)]
out_2 = out_2[~np.isnan(out_2)]
max_diff = np.amax(np.abs(out_1 - out_2))
......@@ -113,6 +115,7 @@ class CommonTestCases:
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
self.assertEqual(first.ne(second).sum().item(), 0)
......@@ -125,6 +128,7 @@ class CommonTestCases:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
attentions = outputs[-1]
......@@ -142,6 +146,7 @@ class CommonTestCases:
config.output_attentions = True
config.output_hidden_states = True
model = model_class(config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
self.assertEqual(out_len+1, len(outputs))
......@@ -181,6 +186,7 @@ class CommonTestCases:
configs_no_init.torchscript = True
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
model.to(torch_device)
model.eval()
inputs = inputs_dict['input_ids'] # Let's keep only input_ids
......@@ -201,7 +207,10 @@ class CommonTestCases:
except ValueError:
self.fail("Couldn't load module.")
model.to(torch_device)
model.eval()
loaded_model.to(torch_device)
loaded_model.eval()
model_params = model.parameters()
......@@ -228,11 +237,12 @@ class CommonTestCases:
configs_no_init = _config_zero_init(config) # To be sure we have no Nan
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
model.to(torch_device)
model.eval()
# Prepare head_mask
# Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
head_mask[0, 0] = 0
head_mask[-1, :-1] = 0
head_mask.requires_grad_(requires_grad=True)
......@@ -282,6 +292,7 @@ class CommonTestCases:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config=config)
model.to(torch_device)
model.eval()
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-1: [0]}
......@@ -310,6 +321,7 @@ class CommonTestCases:
config.output_attentions = True
config.output_hidden_states = False
model = model_class(config=config)
model.to(torch_device)
model.eval()
heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
-1: [0]}
......@@ -319,6 +331,7 @@ class CommonTestCases:
os.makedirs(directory)
model.save_pretrained(directory)
model = model_class.from_pretrained(directory)
model.to(torch_device)
outputs = model(**inputs_dict)
attentions = outputs[-1]
......@@ -346,6 +359,7 @@ class CommonTestCases:
config.pruned_heads = heads_to_prune
model = model_class(config=config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
......@@ -372,6 +386,7 @@ class CommonTestCases:
config.pruned_heads = heads_to_prune
model = model_class(config=config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
......@@ -388,6 +403,7 @@ class CommonTestCases:
os.makedirs(directory)
model.save_pretrained(directory)
model = model_class.from_pretrained(directory)
model.to(torch_device)
shutil.rmtree(directory)
outputs = model(**inputs_dict)
......@@ -419,6 +435,7 @@ class CommonTestCases:
config.output_hidden_states = True
config.output_attentions = False
model = model_class(config)
model.to(torch_device)
model.eval()
outputs = model(**inputs_dict)
hidden_states = outputs[-1]
......@@ -463,6 +480,21 @@ class CommonTestCases:
self.assertTrue(models_equal)
def test_model_common_attributes(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(
model.get_input_embeddings(),
(torch.nn.Embedding, AdaptiveEmbedding)
)
model.set_input_embeddings(torch.nn.Embedding(10, 10))
x = model.get_output_embeddings()
self.assertTrue(
x is None or isinstance(x, torch.nn.Linear)
)
def test_tie_model_weights(self):
if not self.test_torchscript:
return
......@@ -477,11 +509,11 @@ class CommonTestCases:
return equal
for model_class in self.all_model_classes:
if not hasattr(model_class, 'tie_weights'):
continue
config.torchscript = True
model_not_tied = model_class(config)
if model_not_tied.get_output_embeddings() is None:
continue
params_not_tied = list(model_not_tied.parameters())
config_tied = copy.deepcopy(config)
......@@ -516,6 +548,20 @@ class CommonTestCases:
# self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
# self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
def test_inputs_embeds(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
input_ids = inputs_dict["input_ids"]
del inputs_dict["input_ids"]
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
wte = model.get_input_embeddings()
inputs_dict["inputs_embeds"] = wte(input_ids)
outputs = model(**inputs_dict)
class GPTModelTester(CommonModelTester):
......@@ -600,6 +646,7 @@ class CommonTestCases:
def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.base_model_class(config)
model.to(torch_device)
model.eval()
outputs = model(input_ids, position_ids, token_type_ids)
......@@ -615,6 +662,7 @@ class CommonTestCases:
def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.lm_head_model_class(config)
model.to(torch_device)
model.eval()
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
loss, lm_logits = outputs[:2]
......@@ -631,6 +679,7 @@ class CommonTestCases:
mc_labels, lm_labels, mc_token_ids):
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
outputs = model(input_ids)
presents = outputs[-1]
......@@ -643,6 +692,7 @@ class CommonTestCases:
def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
mc_labels, lm_labels, mc_token_ids):
model = self.double_head_model_class(config)
model.to(torch_device)
model.eval()
outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
token_type_ids=token_type_ids, position_ids=position_ids)
......@@ -688,6 +738,7 @@ class CommonTestCases:
config_and_inputs = self.prepare_config_and_inputs()
self.create_and_check_presents(*config_and_inputs)
@slow
def run_slow_tests(self):
self.create_and_check_model_from_pretrained()
......@@ -741,10 +792,28 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
def floats_tensor(shape, scale=1.0, rng=None, name=None):
"""Creates a random float32 tensor of the shape within the vocab size."""
if rng is None:
rng = global_rng
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.random() * scale)
return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
@require_torch
class ModelUtilsTest(unittest.TestCase):
@slow
def test_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
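# --- Illustrative usage sketch (not part of the diff) -----------------------
# The new inputs_embeds path exercised by test_inputs_embeds above, in
# miniature: pre-compute the token embeddings and feed them to the model
# instead of input_ids. BertModel and the tiny config values are illustrative
# assumptions.
import torch
from transformers import BertConfig, BertModel

model = BertModel(BertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                             num_hidden_layers=2, num_attention_heads=4,
                             intermediate_size=37))
model.eval()

input_ids = torch.randint(0, 99, (2, 7))
inputs_embeds = model.get_input_embeddings()(input_ids)       # (2, 7, 32)
sequence_output, pooled_output = model(inputs_embeds=inputs_embeds)
# ---------------------------------------------------------------------------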
......@@ -16,7 +16,6 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
import shutil
import pdb
......@@ -25,13 +24,13 @@ from transformers import is_torch_available
if is_torch_available():
from transformers import (CTRLConfig, CTRLModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
CTRLLMHeadModel)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class CTRLModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (CTRLModel, CTRLLMHeadModel) if is_torch_available() else ()
......@@ -140,6 +139,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
def create_and_check_ctrl_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = CTRLModel(config=config)
model.to(torch_device)
model.eval()
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
......@@ -157,6 +157,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = CTRLLMHeadModel(config)
model.to(torch_device)
model.eval()
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
......@@ -202,7 +203,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
......@@ -17,20 +17,20 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
from transformers import is_torch_available
if is_torch_available():
from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
DistilBertForTokenClassification,
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class DistilBertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
......@@ -125,6 +125,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_distilbert_model(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertModel(config=config)
model.to(torch_device)
model.eval()
(sequence_output,) = model(input_ids, input_mask)
(sequence_output,) = model(input_ids)
......@@ -138,6 +139,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_distilbert_for_masked_lm(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, masked_lm_labels=token_labels)
result = {
......@@ -151,6 +153,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = DistilBertForQuestionAnswering(config=config)
model.to(torch_device)
model.eval()
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
result = {
......@@ -169,6 +172,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
def create_and_check_distilbert_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = DistilBertForSequenceClassification(config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
result = {
......@@ -180,6 +184,22 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.num_labels])
self.check_loss_output(result)
def create_and_check_distilbert_for_token_classification(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = DistilBertForTokenClassification(config=config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, labels=token_labels)
result = {
"loss": loss,
"logits": logits,
}
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, input_mask, sequence_labels, token_labels, choice_labels) = config_and_inputs
......@@ -209,7 +229,11 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_sequence_classification(*config_and_inputs)
# @pytest.mark.slow
def test_for_token_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs)
# @slow
# def test_model_from_pretrained(self):
# cache_dir = "/tmp/transformers_test/"
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
# coding=utf-8
# Copyright 2018 The Hugging Face Inc. Team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import unittest
from transformers import is_torch_available
from .utils import require_torch, slow
if is_torch_available():
from transformers import BertModel, BertForMaskedLM, Model2Model
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
class EncoderDecoderModelTest(unittest.TestCase):
@slow
def test_model2model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = Model2Model.from_pretrained(model_name)
self.assertIsInstance(model.encoder, BertModel)
self.assertIsInstance(model.decoder, BertForMaskedLM)
self.assertEqual(model.decoder.config.is_decoder, True)
self.assertEqual(model.encoder.config.is_decoder, False)
def test_model2model_from_pretrained_not_bert(self):
logging.basicConfig(level=logging.INFO)
with self.assertRaises(ValueError):
_ = Model2Model.from_pretrained('roberta')
with self.assertRaises(ValueError):
_ = Model2Model.from_pretrained('distilbert')
with self.assertRaises(ValueError):
_ = Model2Model.from_pretrained('does-not-exist')
if __name__ == "__main__":
unittest.main()
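# --- Illustrative usage sketch (not part of the diff) -----------------------
# What the test above asserts, in miniature: Model2Model.from_pretrained wraps
# a BERT encoder and a BERT decoder-with-LM-head, flipping is_decoder only on
# the decoder side. "bert-base-uncased" is an assumed checkpoint name and the
# call downloads pretrained weights.
from transformers import Model2Model

model = Model2Model.from_pretrained("bert-base-uncased")
assert model.encoder.config.is_decoder is False
assert model.decoder.config.is_decoder is True
# ---------------------------------------------------------------------------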
......@@ -17,7 +17,6 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
import shutil
from transformers import is_torch_available
......@@ -25,13 +24,13 @@ from transformers import is_torch_available
if is_torch_available():
from transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
GPT2LMHeadModel, GPT2DoubleHeadsModel)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class GPT2ModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel) if is_torch_available() else ()
......@@ -136,6 +135,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = GPT2Model(config=config)
model.to(torch_device)
model.eval()
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
......@@ -153,6 +153,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
def create_and_check_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
model = GPT2LMHeadModel(config)
model.to(torch_device)
model.eval()
loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
......@@ -171,6 +172,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
def create_and_check_double_lm_head_model(self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args):
model = GPT2DoubleHeadsModel(config)
model.to(torch_device)
model.eval()
......@@ -235,7 +237,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
......@@ -17,7 +17,6 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
import shutil
from transformers import is_torch_available
......@@ -25,13 +24,13 @@ from transformers import is_torch_available
if is_torch_available():
from transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel) if is_torch_available() else ()
......@@ -124,6 +123,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTModel(config=config)
model.to(torch_device)
model.eval()
model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
......@@ -139,6 +139,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTLMHeadModel(config)
model.to(torch_device)
model.eval()
loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
......@@ -157,6 +158,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
model = OpenAIGPTDoubleHeadsModel(config)
model.to(torch_device)
model.eval()
loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
......@@ -203,7 +205,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
......@@ -18,21 +18,21 @@ from __future__ import print_function
import unittest
import shutil
import pytest
from transformers import is_torch_available
if is_torch_available():
import torch
from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
RobertaForSequenceClassification, RobertaForTokenClassification)
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
@require_torch
class RobertaModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (RobertaForMaskedLM, RobertaModel) if is_torch_available() else ()
......@@ -128,6 +128,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
token_labels, choice_labels):
model = RobertaModel(config=config)
model.to(torch_device)
model.eval()
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
......@@ -145,6 +146,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
token_labels, choice_labels):
model = RobertaForMaskedLM(config=config)
model.to(torch_device)
model.eval()
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
result = {
......@@ -156,6 +158,23 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
[self.batch_size, self.seq_length, self.vocab_size])
self.check_loss_output(result)
def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask,
sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = RobertaForTokenClassification(config=config)
model.to(torch_device)
model.eval()
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
labels=token_labels)
result = {
"loss": loss,
"logits": logits,
}
self.parent.assertListEqual(
list(result["logits"].size()),
[self.batch_size, self.seq_length, self.num_labels])
self.check_loss_output(result)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
......@@ -178,7 +197,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
@pytest.mark.slow
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -190,10 +209,10 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
class RobertaModelIntegrationTest(unittest.TestCase):
@pytest.mark.slow
@slow
def test_inference_masked_lm(self):
model = RobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
expected_shape = torch.Size((1, 11, 50265))
......@@ -211,10 +230,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
)
@pytest.mark.slow
@slow
def test_inference_no_head(self):
model = RobertaModel.from_pretrained('roberta-base')
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
# compare the actual values for a slice.
......@@ -227,10 +246,10 @@ class RobertaModelIntegrationTest(unittest.TestCase):
torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
)
@pytest.mark.slow
@slow
def test_inference_classification_head(self):
model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
input_ids = torch.tensor([[ 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
output = model(input_ids)[0]
expected_shape = torch.Size((1, 3))
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
import sys
from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_tf, slow
from transformers import AlbertConfig, is_tf_available
if is_tf_available():
import tensorflow as tf
from transformers.modeling_tf_albert import (TFAlbertModel, TFAlbertForMaskedLM,
TFAlbertForSequenceClassification,
TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
@require_tf
class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
all_model_classes = (
TFAlbertModel,
TFAlbertForMaskedLM,
TFAlbertForSequenceClassification
) if is_tf_available() else ()
class TFAlbertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
embedding_size=16,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = ids_tensor(
[self.batch_size, self.seq_length], self.vocab_size)
input_mask = None
if self.use_input_mask:
input_mask = ids_tensor(
[self.batch_size, self.seq_length], vocab_size=2)
token_type_ids = None
if self.use_token_type_ids:
token_type_ids = ids_tensor(
[self.batch_size, self.seq_length], self.type_vocab_size)
sequence_labels = None
token_labels = None
choice_labels = None
if self.use_labels:
sequence_labels = ids_tensor(
[self.batch_size], self.type_sequence_label_size)
token_labels = ids_tensor(
[self.batch_size, self.seq_length], self.num_labels)
choice_labels = ids_tensor([self.batch_size], self.num_choices)
config = AlbertConfig(
vocab_size_or_config_json_file=self.vocab_size,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
intermediate_size=self.intermediate_size,
hidden_act=self.hidden_act,
hidden_dropout_prob=self.hidden_dropout_prob,
attention_probs_dropout_prob=self.attention_probs_dropout_prob,
max_position_embeddings=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range)
return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = TFAlbertModel(config=config)
# inputs = {'input_ids': input_ids,
# 'attention_mask': input_mask,
# 'token_type_ids': token_type_ids}
# sequence_output, pooled_output = model(**inputs)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
sequence_output, pooled_output = model(inputs)
inputs = [input_ids, input_mask]
sequence_output, pooled_output = model(inputs)
sequence_output, pooled_output = model(input_ids)
result = {
"sequence_output": sequence_output.numpy(),
"pooled_output": pooled_output.numpy(),
}
self.parent.assertListEqual(
list(result["sequence_output"].shape),
[self.batch_size, self.seq_length, self.hidden_size])
self.parent.assertListEqual(list(result["pooled_output"].shape), [
self.batch_size, self.hidden_size])
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
model = TFAlbertForMaskedLM(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
prediction_scores, = model(inputs)
result = {
"prediction_scores": prediction_scores.numpy(),
}
self.parent.assertListEqual(
list(result["prediction_scores"].shape),
[self.batch_size, self.seq_length, self.vocab_size])
def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
config.num_labels = self.num_labels
model = TFAlbertForSequenceClassification(config=config)
inputs = {'input_ids': input_ids,
'attention_mask': input_mask,
'token_type_ids': token_type_ids}
logits, = model(inputs)
result = {
"logits": logits.numpy(),
}
self.parent.assertListEqual(
list(result["logits"].shape),
[self.batch_size, self.num_labels])
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
(config, input_ids, token_type_ids, input_mask,
sequence_labels, token_labels, choice_labels) = config_and_inputs
inputs_dict = {'input_ids': input_ids,
'token_type_ids': token_type_ids, 'attention_mask': input_mask}
return config, inputs_dict
def setUp(self):
self.model_tester = TFAlbertModelTest.TFAlbertModelTester(self)
self.config_tester = ConfigTester(
self, config_class=AlbertConfig, hidden_size=37)
def test_config(self):
self.config_tester.run_common_tests()
def test_albert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_model(*config_and_inputs)
def test_for_masked_lm(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_masked_lm(
*config_and_inputs)
def test_for_sequence_classification(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_albert_for_sequence_classification(
*config_and_inputs)
@slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/transformers_test/"
# for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
for model_name in ['albert-base-uncased']:
model = TFAlbertModel.from_pretrained(
model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
self.assertIsNotNone(model)
if __name__ == "__main__":
unittest.main()
......@@ -18,11 +18,12 @@ from __future__ import print_function
import unittest
import shutil
import pytest
import logging
from transformers import is_tf_available
from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
if is_tf_available():
from transformers import (AutoConfig, BertConfig,
TFAutoModel, TFBertModel,
......@@ -33,11 +34,11 @@ if is_tf_available():
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
else:
pytestmark = pytest.mark.skip("Require TensorFlow")
@require_tf
class TFAutoModelTest(unittest.TestCase):
@slow
def test_model_from_pretrained(self):
import h5py
self.assertTrue(h5py.version.hdf5_version.startswith("1.10"))
......@@ -53,6 +54,7 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertModel)
@slow
def test_lmhead_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -65,6 +67,7 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForMaskedLM)
@slow
def test_sequence_classification_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -77,6 +80,7 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForSequenceClassification)
@slow
def test_question_answering_model_from_pretrained(self):
logging.basicConfig(level=logging.INFO)
# for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......@@ -89,6 +93,11 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForQuestionAnswering)
def test_from_pretrained_identifier(self):
logging.basicConfig(level=logging.INFO)
model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True)
self.assertIsInstance(model, TFBertForMaskedLM)
if __name__ == "__main__":
unittest.main()