"vscode:/vscode.git/clone" did not exist on "38630e7a8747429517c9a82129ac32638cebb052"
Commit 0558c9cb authored by thomwolf's avatar thomwolf
Browse files

Merge branch 'master' into t5

parents 608a8f5b e57d00ee
@@ -118,6 +118,9 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
            new_key = key.replace('gamma', 'weight')
        if 'beta' in key:
            new_key = key.replace('beta', 'bias')
+        # DialoGPT format
+        if key == 'lm_head.decoder.weight':
+            new_key = 'lm_head.weight'
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
...
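As a side note on the hunk above: the DialoGPT check slots into the existing key-renaming pass. A minimal, self-contained sketch of that pass (the dict below is a toy stand-in for the real `pt_state_dict`):

```python
# Toy stand-in for the loaded PyTorch state dict (hypothetical keys).
pt_state_dict = {'encoder.layer.0.gamma': 0, 'encoder.layer.0.beta': 1,
                 'lm_head.decoder.weight': 2}

old_keys, new_keys = [], []
for key in pt_state_dict.keys():
    new_key = None
    if 'gamma' in key:
        new_key = key.replace('gamma', 'weight')  # old TF naming -> PyTorch naming
    if 'beta' in key:
        new_key = key.replace('beta', 'bias')
    if key == 'lm_head.decoder.weight':  # DialoGPT checkpoints name the tied LM head this way
        new_key = 'lm_head.weight'
    if new_key:
        old_keys.append(key)
        new_keys.append(new_key)

# Rename after iterating, so the dict is not mutated mid-loop.
for old_key, new_key in zip(old_keys, new_keys):
    pt_state_dict[new_key] = pt_state_dict.pop(old_key)

assert 'lm_head.weight' in pt_state_dict
```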
@@ -24,7 +24,7 @@ import numpy as np
import tensorflow as tf

from .configuration_roberta import RobertaConfig
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer, shape_list
from .file_utils import add_start_docstrings
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
@@ -48,13 +48,17 @@ class TFRobertaEmbeddings(TFBertEmbeddings):
    def _embedding(self, inputs, training=False):
        """Applies embedding based on inputs tensor."""
-        input_ids, position_ids, token_type_ids = inputs
+        input_ids, position_ids, token_type_ids, inputs_embeds = inputs

-        seq_length = tf.shape(input_ids)[1]
+        if input_ids is not None:
+            seq_length = shape_list(input_ids)[1]
+        else:
+            seq_length = shape_list(inputs_embeds)[1]

        if position_ids is None:
            position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :]

-        return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids], training=training)
+        return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids, inputs_embeds], training=training)

class TFRobertaMainLayer(TFBertMainLayer):
@@ -65,6 +69,9 @@ class TFRobertaMainLayer(TFBertMainLayer):
        super(TFRobertaMainLayer, self).__init__(config, **kwargs)
        self.embeddings = TFRobertaEmbeddings(config, name='embeddings')

+    def get_input_embeddings(self):
+        return self.embeddings

class TFRobertaPreTrainedModel(TFPreTrainedModel):
    """ An abstract class to handle weights initialization and
@@ -157,6 +164,10 @@ ROBERTA_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""

@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
@@ -276,6 +287,9 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
        self.roberta = TFRobertaMainLayer(config, name="roberta")
        self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")

+    def get_output_embeddings(self):
+        return self.lm_head.decoder

    def call(self, inputs, **kwargs):
        outputs = self.roberta(inputs, **kwargs)
...
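Taken together, the RoBERTa changes expose an `inputs_embeds` escape hatch plus embedding accessors. A hedged usage sketch, assuming the standard `'roberta-base'` checkpoint and that the embedding layer exposes a `word_embeddings` weight as in the TF BERT embeddings it inherits from:

```python
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')

input_ids = tf.constant(tokenizer.encode("Hello world"))[None, :]  # batch size 1

# Do the embedding lookup ourselves, then bypass it in the model.
embedding_layer = model.roberta.get_input_embeddings()
inputs_embeds = tf.gather(embedding_layer.word_embeddings, input_ids)
outputs = model({'inputs_embeds': inputs_embeds})
prediction_scores = outputs[0]

print(model.get_output_embeddings())  # the new accessor: the LM head's decoder
```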
@@ -337,7 +337,7 @@ class TFAdaptiveEmbedding(tf.keras.layers.Layer):
                emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i])

                mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
-                emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(tf.shape(emb_flat), dtype=tf.int64))
+                emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(shape_list(emb_flat), dtype=tf.int64))

        embed_shape = shape_list(inp) + [self.d_proj]
        embed = tf.reshape(emb_flat, embed_shape)
@@ -413,6 +413,9 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
                                        name='r_r_bias')
        super(TFTransfoXLMainLayer, self).build(input_shape)

+    def get_input_embeddings(self):
+        return self.word_emb

    def _resize_token_embeddings(self, new_num_tokens):
        return self.word_emb
@@ -427,11 +430,11 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
    def _prune_heads(self, heads):
        raise NotImplementedError

-    def init_mems(self, data):
+    def init_mems(self, bsz):
        if self.mem_len > 0:
            mems = []
            for i in range(self.n_layer):
-                empty = tf.zeros([self.mem_len, shape_list(data)[1], self.d_model])
+                empty = tf.zeros([self.mem_len, bsz, self.d_model])
                mems.append(empty)
            return mems
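The signature change works because the empty memory tensors depend only on the batch size, which the caller can now derive from either `input_ids` or `inputs_embeds`. A toy illustration (sizes are made up):

```python
import tensorflow as tf

mem_len, n_layer, d_model = 4, 2, 8  # made-up sizes
bsz = 3

mems = [tf.zeros([mem_len, bsz, d_model]) for _ in range(n_layer)]
assert mems[0].shape == (mem_len, bsz, d_model)  # no input tensor needed
```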
@@ -461,28 +464,37 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        return new_mems

-    def call(self, inputs, mems=None, head_mask=None, training=False):
+    def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, training=False):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
-            assert len(inputs) <= 3, "Too many inputs."
+            inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
+            assert len(inputs) <= 4, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            mems = inputs.get('mems', mems)
            head_mask = inputs.get('head_mask', head_mask)
-            assert len(inputs) <= 3, "Too many inputs."
+            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            assert len(inputs) <= 4, "Too many inputs."
        else:
            input_ids = inputs

        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
-        input_ids = tf.transpose(input_ids, perm=(1, 0))
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = tf.transpose(input_ids, perm=(1, 0))
+            qlen, bsz = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
+            qlen, bsz = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if mems is None:
-            mems = self.init_mems(input_ids)
+            mems = self.init_mems(bsz)
-        qlen, bsz = shape_list(input_ids)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
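The branch above also fixes where `qlen` and `bsz` come from: the layer works internally in `[len, bsz(, dim)]` order, so whichever input is present gets transposed first. A toy check of the `inputs_embeds` path:

```python
import tensorflow as tf

inputs_embeds = tf.zeros([3, 5, 8])                          # [bsz, len, dim]
inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))  # -> [len, bsz, dim]
qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
assert (qlen, bsz) == (5, 3)
```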
@@ -494,7 +506,10 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
        else:
            head_mask = [None] * self.n_layer

-        word_emb = self.word_emb(input_ids)
+        if inputs_embeds is not None:
+            word_emb = inputs_embeds
+        else:
+            word_emb = self.word_emb(input_ids)

        mlen = shape_list(mems[0])[0] if mems is not None else 0
        klen = mlen + qlen
@@ -626,6 +641,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""

@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
@@ -716,28 +735,33 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
    def reset_length(self, tgt_len, ext_len, mem_len):
        self.transformer.reset_length(tgt_len, ext_len, mem_len)

-    def init_mems(self, data):
-        return self.transformer.init_mems(data)
+    def init_mems(self, bsz):
+        return self.transformer.init_mems(bsz)

-    def call(self, inputs, mems=None, head_mask=None, labels=None, training=False):
+    def call(self, inputs, mems=None, head_mask=None, inputs_embeds=None, labels=None, training=False):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            mems = inputs[1] if len(inputs) > 1 else mems
            head_mask = inputs[2] if len(inputs) > 2 else head_mask
-            labels = inputs[3] if len(inputs) > 3 else labels
-            assert len(inputs) <= 4, "Too many inputs."
+            inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds
+            labels = inputs[4] if len(inputs) > 4 else labels
+            assert len(inputs) <= 5, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            mems = inputs.get('mems', mems)
            head_mask = inputs.get('head_mask', head_mask)
+            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
            labels = inputs.get('labels', labels)
-            assert len(inputs) <= 4, "Too many inputs."
+            assert len(inputs) <= 5, "Too many inputs."
        else:
            input_ids = inputs

-        bsz, tgt_len = shape_list(input_ids)[:2]
+        if input_ids is not None:
+            bsz, tgt_len = shape_list(input_ids)[:2]
+        else:
+            bsz, tgt_len = shape_list(inputs_embeds)[:2]

-        transformer_outputs = self.transformer([input_ids, mems, head_mask], training=training)
+        transformer_outputs = self.transformer([input_ids, mems, head_mask, inputs_embeds], training=training)

        last_hidden = transformer_outputs[0]
        pred_hid = last_hidden[:, -tgt_len:]
...
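Because `labels` moved from positional slot 3 to slot 4 to make room for `inputs_embeds`, positional lists written against the old order will silently misbehave; a dict (or keyword arguments) is the safer calling convention. A hedged sketch, assuming the standard `'transfo-xl-wt103'` checkpoint:

```python
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')

input_ids = tf.constant(tokenizer.encode("hello world"))[None, :]  # batch size 1

# Positional order is now (input_ids, mems, head_mask, inputs_embeds, labels);
# passing a dict avoids depending on it.
outputs = model({'input_ids': input_ids})
prediction_scores, mems = outputs[0], outputs[1]
```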
@@ -105,7 +105,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
    @staticmethod
    def _gather_logprob(logprob, target):
-        lp_size = tf.shape(logprob)
+        lp_size = shape_list(logprob)
        r = tf.range(lp_size[0])
        idx = tf.stack([r, target], 1)
        return tf.gather_nd(logprob, idx)

@@ -159,7 +159,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                    cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
                    cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
                if target is not None:
-                    loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(tf.shape(loss), dtype=tf.int64))
+                    loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(shape_list(loss), dtype=tf.int64))
        out = tf.concat(out, axis=-1)

        if target is not None:
...
@@ -35,7 +35,7 @@ class TFPreTrainedModel(tf.keras.Model):
    r""" Base class for all TF models.

        :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
-        as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
+        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.

        Class attributes (overridden by derived classes):
            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
@@ -51,7 +51,15 @@ class TFPreTrainedModel(tf.keras.Model):
    config_class = None
    pretrained_model_archive_map = {}
    base_model_prefix = ""
-    dummy_inputs = tf.constant(DUMMY_INPUTS)  # dummy inputs to build the network
+
+    @property
+    def dummy_inputs(self):
+        """ Dummy inputs to build the network.
+
+        Returns:
+            tf.Tensor with dummy inputs
+        """
+        return tf.constant(DUMMY_INPUTS)

    def __init__(self, config, *inputs, **kwargs):
        super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
@@ -65,6 +73,21 @@ class TFPreTrainedModel(tf.keras.Model):
        # Save config in model
        self.config = config

+    def get_input_embeddings(self):
+        """ Get model's input embeddings
+        """
+        base_model = getattr(self, self.base_model_prefix, self)
+        if base_model is not self:
+            return base_model.get_input_embeddings()
+        else:
+            raise NotImplementedError
+
+    def get_output_embeddings(self):
+        """ Get model's output embeddings
+            Return None if the model doesn't have output embeddings
+        """
+        return None  # Overwrite for models with output embeddings
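From user code, the two accessors make it possible to reach the embedding matrices without knowing each architecture's attribute names. A hedged sketch (it assumes the BERT main layer gained the same `get_input_embeddings` accessor in this commit; its hunk is not shown here):

```python
from transformers import TFBertModel

model = TFBertModel.from_pretrained('bert-base-uncased')

input_embeddings = model.get_input_embeddings()    # delegates to the base model
output_embeddings = model.get_output_embeddings()  # None: no LM head on this model

assert output_embeddings is None
print(input_embeddings)  # the shared token embedding layer
```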
    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
        """ Build a resized Embedding Variable from a provided token Embedding Module.
            Increasing the size will add newly initialized vectors at the end
@@ -176,6 +199,9 @@ class TFPreTrainedModel(tf.keras.Model):
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
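A quick usage note for the new flag, sketched for the TF loader documented above:

```python
from transformers import TFBertModel

# Keeps a partially downloaded archive and resumes it on the next call,
# instead of starting the download from scratch.
model = TFBertModel.from_pretrained('bert-base-uncased', resume_download=True)
```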
@@ -201,6 +227,7 @@ class TFPreTrainedModel(tf.keras.Model):
        cache_dir = kwargs.pop('cache_dir', None)
        from_pt = kwargs.pop('from_pt', False)
        force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
        proxies = kwargs.pop('proxies', None)

        # Load config
@@ -209,6 +236,7 @@ class TFPreTrainedModel(tf.keras.Model):
                pretrained_model_name_or_path, *model_args,
                cache_dir=cache_dir, return_unused_kwargs=True,
                force_download=force_download,
+                resume_download=resume_download,
                **kwargs
            )
        else:
@@ -236,7 +264,8 @@ class TFPreTrainedModel(tf.keras.Model):
        # redirect to the cache, if necessary
        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
+                                                resume_download=resume_download, proxies=proxies)
        except EnvironmentError as e:
            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                logger.error(
@@ -439,7 +468,7 @@ class TFSequenceSummary(tf.keras.layers.Layer):
        elif self.summary_type == 'first':
            output = hidden_states[:, 0]
        elif self.summary_type == 'mean':
-            output = tf.mean(hidden_states, axis=1)
+            output = tf.reduce_mean(hidden_states, axis=1)
        elif self.summary_type == 'cls_index':
            hidden_shape = shape_list(hidden_states)  # e.g. [batch, num choices, seq length, hidden dims]
            if cls_index is None:
@@ -477,10 +506,10 @@ def shape_list(x):
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]

def get_initializer(initializer_range=0.02):
    """Creates a `tf.initializers.truncated_normal` with the given range.

    Args:
        initializer_range: float, initializer range for stddev.

    Returns:
        TruncatedNormal initializer with stddev = `initializer_range`.
    """
    return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
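Typical use of `get_initializer` is when stacking a head on top of a pretrained trunk; `0.02` here mirrors the usual `initializer_range` default, but any config value works:

```python
import tensorflow as tf
from transformers.modeling_tf_utils import get_initializer

classifier = tf.keras.layers.Dense(
    units=2,
    kernel_initializer=get_initializer(0.02),  # truncated normal, stddev 0.02
    name='classifier')
```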
@@ -277,6 +277,9 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            self.prune_heads({int(layer): list(map(int, heads))})

+    def get_input_embeddings(self):
+        return self.embeddings

    def _resize_token_embeddings(self, new_num_tokens):
        raise NotImplementedError
@@ -288,7 +291,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
        raise NotImplementedError

    def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None,
-             position_ids=None, lengths=None, cache=None, head_mask=None,
+             position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None,
             training=False):  # removed: src_enc=None, src_len=None
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -299,7 +302,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            lengths = inputs[5] if len(inputs) > 5 else lengths
            cache = inputs[6] if len(inputs) > 6 else cache
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
-            assert len(inputs) <= 8, "Too many inputs."
+            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
+            assert len(inputs) <= 9, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
@@ -309,16 +313,28 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            lengths = inputs.get('lengths', lengths)
            cache = inputs.get('cache', cache)
            head_mask = inputs.get('head_mask', head_mask)
-            assert len(inputs) <= 8, "Too many inputs."
+            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            assert len(inputs) <= 9, "Too many inputs."
        else:
            input_ids = inputs

+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            bs, slen = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            bs, slen = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if lengths is None:
-            lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
+            if input_ids is not None:
+                lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
+            else:
+                lengths = tf.convert_to_tensor([slen]*bs, tf.int32)
        # mask = input_ids != self.pad_index

        # check inputs
-        bs, slen = shape_list(input_ids)
        # assert shape_list(lengths)[0] == bs
        tf.debugging.assert_equal(shape_list(lengths)[0], bs)
        # assert lengths.max().item() <= slen
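The `lengths` fallback above is worth spelling out: with `input_ids` the true lengths are recovered by counting non-pad tokens, but an embedded input carries no pad markers, so every row is assumed to be full-length. A toy comparison:

```python
import tensorflow as tf

pad_index, bs, slen = 2, 2, 5
input_ids = tf.constant([[4, 5, 6, 2, 2],
                         [7, 8, 2, 2, 2]])

from_ids = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, pad_index), tf.int32), axis=1)
from_embeds = tf.convert_to_tensor([slen] * bs, tf.int32)  # the fallback

print(from_ids.numpy())     # [3 2]
print(from_embeds.numpy())  # [5 5]
```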
@@ -358,7 +374,7 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
            head_mask = [None] * self.n_layers

        # do not recompute cached elements
-        if cache is not None:
+        if cache is not None and input_ids is not None:
            _slen = slen - cache['slen']
            input_ids = input_ids[:, -_slen:]
            position_ids = position_ids[:, -_slen:]
@@ -368,8 +384,10 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
                attn_mask = attn_mask[:, -_slen:]

        # embeddings
-        tensor = self.embeddings(input_ids)
-        tensor = tensor + self.position_embeddings(position_ids)
+        if inputs_embeds is None:
+            inputs_embeds = self.embeddings(input_ids)
+
+        tensor = inputs_embeds + self.position_embeddings(position_ids)
        if langs is not None and self.use_lang_emb:
            tensor = tensor + self.lang_embeddings(langs)
        if token_type_ids is not None:
@@ -530,6 +548,10 @@ XLM_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""

@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
@@ -637,6 +659,8 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
        self.transformer = TFXLMMainLayer(config, name='transformer')
        self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj')

+    def get_output_embeddings(self):
+        return self.pred_layer.input_embeddings

    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)
...
@@ -112,8 +112,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
    def prune_heads(self, heads):
        raise NotImplementedError

-    @staticmethod
-    def rel_shift(x, klen=-1):
+    def rel_shift(self, x, klen=-1):
        """perform relative shift to form the relative attention score."""
        x_size = shape_list(x)
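The hunk cuts the body off; for orientation, a sketch of the full relative-shift trick as it appears upstream (a reshape/slice re-indexing, no gather), written here as a free function with static shapes for self-containedness:

```python
import tensorflow as tf

def rel_shift(x, klen=-1):
    """Re-index position scores so each query row lines up with its relative positions."""
    x_size = x.shape.as_list()  # [qlen, pos, bsz, n_head]
    x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3]))
    x = x[1:, ...]  # drop the first, now misaligned, row
    x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3]))
    x = x[:, :klen, ...]
    return x
```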
@@ -135,7 +134,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
        # position based attention score
        bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
-        bd = self.rel_shift(bd, klen=ac.shape[1])
+        bd = self.rel_shift(bd, klen=shape_list(ac)[1])

        # segment based attention score
        if seg_mat is None:
@@ -192,7 +191,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
        if g is not None:
            ###### Two-stream attention with relative positional encoding.
            # content based attention score
-            if mems is not None and mems.shape.ndims > 1:
+            if mems is not None and len(shape_list(mems)) > 1:
                cat = tf.concat([mems, h], axis=0)
            else:
                cat = h
@@ -252,7 +251,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer):
        else:
            ###### Multi-head attention with relative positional encoding
-            if mems is not None and mems.shape.ndims > 1:
+            if mems is not None and len(shape_list(mems)) > 1:
                cat = tf.concat([mems, h], axis=0)
            else:
                cat = h
@@ -371,6 +370,9 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
        self.dropout = tf.keras.layers.Dropout(config.dropout)

+    def get_input_embeddings(self):
+        return self.word_embedding

    def build(self, input_shape):
        initializer = get_initializer(self.initializer_range)
        self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
@@ -484,7 +486,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        return pos_emb

    def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-             token_type_ids=None, input_mask=None, head_mask=None, training=False):
+             token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, training=False):
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
@@ -494,7 +496,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids
            input_mask = inputs[6] if len(inputs) > 6 else input_mask
            head_mask = inputs[7] if len(inputs) > 7 else head_mask
-            assert len(inputs) <= 8, "Too many inputs."
+            inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds
+            assert len(inputs) <= 9, "Too many inputs."
        elif isinstance(inputs, dict):
            input_ids = inputs.get('input_ids')
            attention_mask = inputs.get('attention_mask', attention_mask)
@@ -504,7 +507,8 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            token_type_ids = inputs.get('token_type_ids', token_type_ids)
            input_mask = inputs.get('input_mask', input_mask)
            head_mask = inputs.get('head_mask', head_mask)
-            assert len(inputs) <= 8, "Too many inputs."
+            inputs_embeds = inputs.get('inputs_embeds', inputs_embeds)
+            assert len(inputs) <= 9, "Too many inputs."
        else:
            input_ids = inputs
@@ -512,14 +516,23 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        # but we want a unified interface in the library with the batch size on the first dimension
        # so we move here the first dimension (batch) to the end

-        input_ids = tf.transpose(input_ids, perm=(1, 0))
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = tf.transpose(input_ids, perm=(1, 0))
+            qlen, bsz = shape_list(input_ids)[:2]
+        elif inputs_embeds is not None:
+            inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
+            qlen, bsz = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

        token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
        input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
        attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
        perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
        target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None

-        qlen, bsz = shape_list(input_ids)[:2]
        mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0
        klen = mlen + qlen
@@ -551,7 +564,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
        if data_mask is not None:
            # all mems can be attended to
-            mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
+            mems_mask = tf.zeros([shape_list(data_mask)[0], mlen, bsz],
                                 dtype=dtype_float)
            data_mask = tf.concat([mems_mask, data_mask], axis=1)
        if attn_mask is None:
@@ -570,10 +583,13 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
            non_tgt_mask = None

        ##### Word embeddings and prepare h & g hidden states
-        word_emb_k = self.word_embedding(input_ids)
+        if inputs_embeds is not None:
+            word_emb_k = inputs_embeds
+        else:
+            word_emb_k = self.word_embedding(input_ids)
        output_h = self.dropout(word_emb_k, training=training)
        if target_mapping is not None:
-            word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
+            word_emb_q = tf.tile(self.mask_emb, [shape_list(target_mapping)[0], bsz, 1])
        # else:  # We removed the inp_q input which was same as target mapping
        #     inp_q_ext = inp_q[:, :, None]
        #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
@@ -762,6 +778,10 @@ XLNET_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""

@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
@@ -850,6 +870,9 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
        self.transformer = TFXLNetMainLayer(config, name='transformer')
        self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss')

+    def get_output_embeddings(self):
+        return self.lm_loss.input_embeddings

    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)
        hidden_state = transformer_outputs[0]
@@ -915,6 +938,59 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
        return outputs  # return logits, (mems), (hidden states), (attentions)

+@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class TFXLNetForTokenClassification(TFXLNetPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **mems**: (`optional`, returned when ``config.mem_len > 0``)
+            list of ``tf.Tensor`` (one for each layer)
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import XLNetTokenizer, TFXLNetForTokenClassification
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = TFXLNetForTokenClassification.from_pretrained('xlnet-large-cased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXLNetForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFXLNetMainLayer(config, name='transformer')
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        transformer_outputs = self.transformer(inputs, **kwargs)
+        output = transformer_outputs[0]
+
+        logits = self.classifier(output)
+
+        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if they are present
+
+        return outputs  # return logits, (mems), (hidden states), (attentions)

# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
#     the hidden-states output to compute `span start logits` and `span end logits`). """,
#     XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
...
@@ -553,6 +553,10 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", @add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
@@ -657,12 +661,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        logger.info("Head pruning is not implemented for Transformer-XL model")
        pass

-    def init_mems(self, data):
+    def init_mems(self, bsz):
        if self.mem_len > 0:
            mems = []
            param = next(self.parameters())
            for i in range(self.n_layer):
-                empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model,
+                empty = torch.zeros(self.mem_len, bsz, self.config.d_model,
                                    dtype=param.dtype, device=param.device)
                mems.append(empty)
@@ -693,15 +697,22 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        return new_mems

-    def forward(self, input_ids, mems=None, head_mask=None):
+    def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None):
        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
-        input_ids = input_ids.transpose(0, 1).contiguous()
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = input_ids.transpose(0, 1).contiguous()
+            qlen, bsz = input_ids.size()
+        elif inputs_embeds is not None:
+            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
+            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if mems is None:
-            mems = self.init_mems(input_ids)
+            mems = self.init_mems(bsz)
-        qlen, bsz = input_ids.size()

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
@@ -718,7 +729,10 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        else:
            head_mask = [None] * self.n_layer

-        word_emb = self.word_emb(input_ids)
+        if inputs_embeds is not None:
+            word_emb = inputs_embeds
+        else:
+            word_emb = self.word_emb(input_ids)

        mlen = mems[0].size(0) if mems is not None else 0
        klen = mlen + qlen
@@ -860,14 +874,18 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
    def reset_length(self, tgt_len, ext_len, mem_len):
        self.transformer.reset_length(tgt_len, ext_len, mem_len)

-    def init_mems(self, data):
-        return self.transformer.init_mems(data)
+    def init_mems(self, bsz):
+        return self.transformer.init_mems(bsz)

-    def forward(self, input_ids, mems=None, head_mask=None, labels=None):
-        bsz = input_ids.size(0)
-        tgt_len = input_ids.size(1)
+    def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, labels=None):
+        if input_ids is not None:
+            bsz, tgt_len = input_ids.size(0), input_ids.size(1)
+        elif inputs_embeds is not None:
+            bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")

-        transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask)
+        transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask, inputs_embeds=inputs_embeds)

        last_hidden = transformer_outputs[0]
        pred_hid = last_hidden[:, -tgt_len:]
...
@@ -53,7 +53,7 @@ class PreTrainedModel(nn.Module):
    r""" Base class for all models.

        :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
-        as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
+        as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.

        Class attributes (overridden by derived classes):
            - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
@@ -237,7 +237,7 @@ class PreTrainedModel(nn.Module):
        """
        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"

-        # Only save the model it-self if we are using distributed training
+        # Only save the model itself if we are using distributed training
        model_to_save = self.module if hasattr(self, 'module') else self

        # Save configuration file
@@ -290,6 +290,9 @@ class PreTrainedModel(nn.Module):
            force_download: (`optional`) boolean, default False:
                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+            resume_download: (`optional`) boolean, default False:
+                Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
            proxies: (`optional`) dict, default None:
                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
                The proxies are used on each request.
@@ -314,11 +317,16 @@ class PreTrainedModel(nn.Module):
            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)

        """
+        if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
+            logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
+                           "https://github.com/google-research/google-research/issues/119 for more information.")
+
        config = kwargs.pop('config', None)
        state_dict = kwargs.pop('state_dict', None)
        cache_dir = kwargs.pop('cache_dir', None)
        from_tf = kwargs.pop('from_tf', False)
        force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
        proxies = kwargs.pop('proxies', None)
        output_loading_info = kwargs.pop('output_loading_info', False)
@@ -328,6 +336,7 @@ class PreTrainedModel(nn.Module):
                pretrained_model_name_or_path, *model_args,
                cache_dir=cache_dir, return_unused_kwargs=True,
                force_download=force_download,
+                resume_download=resume_download,
                proxies=proxies,
                **kwargs
            )
@@ -360,7 +369,8 @@ class PreTrainedModel(nn.Module):
        # redirect to the cache, if necessary
        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download,
+                                                proxies=proxies, resume_download=resume_download)
        except EnvironmentError:
            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                msg = "Couldn't reach server at '{}' to download pretrained weights.".format(
@@ -416,6 +426,8 @@ class PreTrainedModel(nn.Module):
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
+            if key == 'lm_head.decoder.weight':
+                new_key = 'lm_head.weight'
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
@@ -726,7 +738,7 @@ class SequenceSummary(nn.Module):
    def __init__(self, config):
        super(SequenceSummary, self).__init__()

-        self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
+        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
        if self.summary_type == 'attn':
            # We should use a standard multi-head attention module with absolute positional embedding for that.
            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
...
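The one-word change above fixes a real bug: the `hasattr` guard and the attribute it guarded did not match, so `summary_type` could be silently ignored or raise. A toy illustration of the corrected logic (`Config` here is a made-up stand-in):

```python
class Config:
    summary_type = 'mean'  # made-up config that sets only summary_type

config = Config()

# Corrected: the guard and the attribute now match.
summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
assert summary_type == 'mean'

# Before the fix the guard checked 'summary_use_proj', which this config
# lacks, so the value would have silently fallen back to 'last'.
```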
@@ -311,6 +311,10 @@ XLM_INPUTS_DOCSTRING = r"""
            Mask to nullify selected heads of the self-attention modules.
            Mask values selected in ``[0, 1]``:
            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
"""
@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.", @add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
@@ -421,14 +425,21 @@ class XLMModel(XLMPreTrainedModel):
        for layer, heads in heads_to_prune.items():
            self.attentions[layer].prune_heads(heads)

-    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
-                lengths=None, cache=None, head_mask=None):  # removed: src_enc=None, src_len=None
+    def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, inputs_embeds=None):  # removed: src_enc=None, src_len=None
+        if input_ids is not None:
+            bs, slen = input_ids.size()
+        else:
+            bs, slen = inputs_embeds.size()[:-1]

        if lengths is None:
-            lengths = (input_ids != self.pad_index).sum(dim=1).long()
+            if input_ids is not None:
+                lengths = (input_ids != self.pad_index).sum(dim=1).long()
+            else:
+                lengths = torch.LongTensor([slen]*bs)
        # mask = input_ids != self.pad_index

        # check inputs
-        bs, slen = input_ids.size()
        assert lengths.size(0) == bs
        assert lengths.max().item() <= slen
        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
...@@ -442,10 +453,12 @@ class XLMModel(XLMPreTrainedModel): ...@@ -442,10 +453,12 @@ class XLMModel(XLMPreTrainedModel):
# if self.is_decoder and src_enc is not None: # if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
device = input_ids.device if input_ids is not None else inputs_embeds.device
# position_ids # position_ids
if position_ids is None: if position_ids is None:
position_ids = input_ids.new((slen,)).long() position_ids = torch.arange(slen, dtype=torch.long, device=device)
position_ids = torch.arange(slen, out=position_ids).unsqueeze(0) position_ids = position_ids.unsqueeze(0).expand((bs, slen))
else: else:
assert position_ids.size() == (bs, slen) # (slen, bs) assert position_ids.size() == (bs, slen) # (slen, bs)
# position_ids = position_ids.transpose(0, 1) # position_ids = position_ids.transpose(0, 1)
...@@ -471,7 +484,7 @@ class XLMModel(XLMPreTrainedModel): ...@@ -471,7 +484,7 @@ class XLMModel(XLMPreTrainedModel):
head_mask = [None] * self.n_layers head_mask = [None] * self.n_layers
# do not recompute cached elements # do not recompute cached elements
if cache is not None: if cache is not None and input_ids is not None:
_slen = slen - cache['slen'] _slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:] input_ids = input_ids[:, -_slen:]
position_ids = position_ids[:, -_slen:] position_ids = position_ids[:, -_slen:]
...@@ -481,8 +494,10 @@ class XLMModel(XLMPreTrainedModel): ...@@ -481,8 +494,10 @@ class XLMModel(XLMPreTrainedModel):
attn_mask = attn_mask[:, -_slen:] attn_mask = attn_mask[:, -_slen:]
# embeddings # embeddings
tensor = self.embeddings(input_ids) if inputs_embeds is None:
tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor) inputs_embeds = self.embeddings(input_ids)
tensor = inputs_embeds + self.position_embeddings(position_ids).expand_as(inputs_embeds)
if langs is not None and self.use_lang_emb: if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs) tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None: if token_type_ids is not None:
...@@ -624,8 +639,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -624,8 +639,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
def get_output_embeddings(self): def get_output_embeddings(self):
return self.pred_layer.proj return self.pred_layer.proj
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -633,7 +648,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): ...@@ -633,7 +648,8 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
outputs = self.pred_layer(output, labels) outputs = self.pred_layer(output, labels)
...@@ -685,8 +701,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -685,8 +701,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, labels=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -694,7 +710,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel): ...@@ -694,7 +710,8 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
logits = self.sequence_summary(output) logits = self.sequence_summary(output)
...@@ -768,8 +785,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -768,8 +785,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None): lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
langs=langs, langs=langs,
...@@ -777,7 +794,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): ...@@ -777,7 +794,8 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = transformer_outputs[0] sequence_output = transformer_outputs[0]
...@@ -863,8 +881,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -863,8 +881,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None,
is_impossible=None, cls_index=None, p_mask=None): is_impossible=None, cls_index=None, p_mask=None):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -873,7 +891,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): ...@@ -873,7 +891,8 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
position_ids=position_ids, position_ids=position_ids,
lengths=lengths, lengths=lengths,
cache=cache, cache=cache,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
......
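As a usage sketch of the new `inputs_embeds` path in XLM (checkpoint name illustrative; with no padding and default masks the two calls should produce matching hidden states):

import torch
from transformers import XLMModel, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
model.eval()

input_ids = torch.tensor([tokenizer.encode("Hello world")])
inputs_embeds = model.embeddings(input_ids)   # do the embedding lookup manually

with torch.no_grad():
    out_from_ids = model(input_ids)[0]
    out_from_embeds = model(inputs_embeds=inputs_embeds)[0]
assert torch.allclose(out_from_ids, out_from_embeds, atol=1e-5)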
@@ -558,6 +558,10 @@ XLNET_INPUTS_DOCSTRING = r"""
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
 """

 @add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
@@ -579,6 +583,7 @@ class XLNetModel(XLNetPreTrainedModel):
         **attentions**: (`optional`, returned when ``config.output_attentions=True``)
             list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

     Examples::

@@ -712,19 +717,29 @@ class XLNetModel(XLNetPreTrainedModel):
         pos_emb = pos_emb.to(next(self.parameters()))
         return pos_emb

-    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None):
+    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None):
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
         # so we move here the first dimension (batch) to the end
-        input_ids = input_ids.transpose(0, 1).contiguous()
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = input_ids.transpose(0, 1).contiguous()
+            qlen, bsz = input_ids.shape[0], input_ids.shape[1]
+        elif inputs_embeds is not None:
+            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
+            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
         token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
         input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
         attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
         perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
         target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None

-        qlen, bsz = input_ids.shape[0], input_ids.shape[1]
         mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
         klen = mlen + qlen
@@ -777,7 +792,10 @@ class XLNetModel(XLNetPreTrainedModel):
             non_tgt_mask = None

         ##### Word embeddings and prepare h & g hidden states
-        word_emb_k = self.word_embedding(input_ids)
+        if inputs_embeds is not None:
+            word_emb_k = inputs_embeds
+        else:
+            word_emb_k = self.word_embedding(input_ids)
         output_h = self.dropout(word_emb_k)
         if target_mapping is not None:
             word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
@@ -861,7 +879,11 @@ class XLNetModel(XLNetPreTrainedModel):
             hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
             outputs = outputs + (hidden_states,)
         if self.output_attentions:
-            attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
+            if target_mapping is not None:
+                # when target_mapping is provided, there are 2-tuple of attentions
+                attentions = tuple(tuple(att_stream.permute(2, 3, 0, 1).contiguous() for att_stream in t) for t in attentions)
+            else:
+                attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
             outputs = outputs + (attentions,)

         return outputs  # outputs, (new_mems), (hidden_states), (attentions)
@@ -896,6 +918,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         **attentions**: (`optional`, returned when ``config.output_attentions=True``)
             list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

     Examples::

@@ -924,8 +947,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_loss

-    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
         transformer_outputs = self.transformer(input_ids,
                                                attention_mask=attention_mask,
                                                mems=mems,
@@ -933,7 +956,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
                                                target_mapping=target_mapping,
                                                token_type_ids=token_type_ids,
                                                input_mask=input_mask,
-                                               head_mask=head_mask)
+                                               head_mask=head_mask,
+                                               inputs_embeds=inputs_embeds)

         logits = self.lm_loss(transformer_outputs[0])
@@ -977,6 +1001,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         **attentions**: (`optional`, returned when ``config.output_attentions=True``)
             list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+            When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.

     Examples::

@@ -998,8 +1023,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.init_weights()

-    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
-                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+    def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
         transformer_outputs = self.transformer(input_ids,
                                                attention_mask=attention_mask,
                                                mems=mems,
@@ -1007,7 +1032,8 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                                                target_mapping=target_mapping,
                                                token_type_ids=token_type_ids,
                                                input_mask=input_mask,
-                                               head_mask=head_mask)
+                                               head_mask=head_mask,
+                                               inputs_embeds=inputs_embeds)
         output = transformer_outputs[0]
         output = self.sequence_summary(output)
@@ -1027,6 +1053,106 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a token classification head on top (a linear layer on top of
the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
XLNET_START_DOCSTRING,
XLNET_INPUTS_DOCSTRING)
class XLNetForTokenClassification(XLNetPreTrainedModel):
r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to scores.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
Classification scores (before SoftMax).
**mems**: (`optional`, returned when ``config.mem_len > 0``)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
scores = outputs[0]
"""
def __init__(self, config):
super(XLNetForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
self.transformer = XLNetModel(config)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None):
outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0]
logits = self.classifier(sequence_output)
outputs = (logits,) + outputs[1:] # Keep mems, hidden states, attentions if there are in it
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
outputs = (loss,) + outputs
return outputs # return (loss), logits, (mems), (hidden states), (attentions)
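The masked loss above is the usual way to keep padding positions out of a token-classification loss. The core step in isolation (all shapes invented):

import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)
labels = torch.randint(num_labels, (batch_size, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1               # True only at real tokens
active_logits = logits.view(-1, num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)            # padding excluded from the loss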
@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of @add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RACE/SWAG tasks. """, the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
...@@ -1049,6 +1175,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1049,6 +1175,10 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
Mask to nullify selected heads of the self-attention modules. Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``: Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
...@@ -1072,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1072,6 +1202,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``) **attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
Examples:: Examples::
...@@ -1093,9 +1224,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1093,9 +1224,9 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None, def forward(self, input_ids=None, token_type_ids=None, input_mask=None, attention_mask=None,
mems=None, perm_mask=None, target_mapping=None, mems=None, perm_mask=None, target_mapping=None,
labels=None, head_mask=None): labels=None, head_mask=None, inputs_embeds=None):
num_choices = input_ids.shape[1] num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_input_ids = input_ids.view(-1, input_ids.size(-1))
...@@ -1106,7 +1237,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): ...@@ -1106,7 +1237,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids, transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids,
input_mask=flat_input_mask, attention_mask=flat_attention_mask, input_mask=flat_input_mask, attention_mask=flat_attention_mask,
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping, mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
head_mask=head_mask) head_mask=head_mask, inputs_embeds=inputs_embeds)
output = transformer_outputs[0] output = transformer_outputs[0]
...@@ -1157,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1157,6 +1288,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``) **attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
Examples:: Examples::
...@@ -1178,8 +1310,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1178,8 +1310,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None): start_positions=None, end_positions=None):
outputs = self.transformer(input_ids, outputs = self.transformer(input_ids,
...@@ -1189,7 +1321,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): ...@@ -1189,7 +1321,8 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
target_mapping=target_mapping, target_mapping=target_mapping,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
input_mask=input_mask, input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
sequence_output = outputs[0] sequence_output = outputs[0]
...@@ -1270,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1270,6 +1403,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
**attentions**: (`optional`, returned when ``config.output_attentions=True``) **attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
When ``target_mapping is not None``, the attentions outputs are a list of 2-tuple of ``torch.FloatTensor``.
Examples:: Examples::
...@@ -1294,8 +1428,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1294,8 +1428,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
self.init_weights() self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None,
start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,): start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
transformer_outputs = self.transformer(input_ids, transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
...@@ -1304,7 +1438,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): ...@@ -1304,7 +1438,8 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
target_mapping=target_mapping, target_mapping=target_mapping,
token_type_ids=token_type_ids, token_type_ids=token_type_ids,
input_mask=input_mask, input_mask=input_mask,
head_mask=head_mask) head_mask=head_mask,
inputs_embeds=inputs_embeds)
hidden_states = transformer_outputs[0] hidden_states = transformer_outputs[0]
start_logits = self.start_logits(hidden_states, p_mask=p_mask) start_logits = self.start_logits(hidden_states, p_mask=p_mask)
......
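To observe the 2-tuple attention outputs documented above, one can run XLNet with a `target_mapping`. A sketch with a deliberately tiny, invented config (parameter names follow the config style used elsewhere in this revision):

import torch
from transformers import XLNetConfig, XLNetModel

config = XLNetConfig(vocab_size_or_config_json_file=32, d_model=16,
                     n_layer=2, n_head=2, d_inner=32, output_attentions=True)
model = XLNetModel(config)
model.eval()

input_ids = torch.randint(32, (1, 4))
target_mapping = torch.zeros(1, 1, 4)   # one prediction target...
target_mapping[0, 0, -1] = 1.0          # ...mapped onto the last position

attentions = model(input_ids, target_mapping=target_mapping)[-1]
# each layer now yields one tensor per attention stream (h and g)
assert isinstance(attentions[0], tuple) and len(attentions[0]) == 2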
@@ -23,86 +23,66 @@ from torch.optim.lr_scheduler import LambdaLR

 logger = logging.getLogger(__name__)

-class ConstantLRSchedule(LambdaLR):
-    """ Constant learning rate schedule.
-    """
-    def __init__(self, optimizer, last_epoch=-1):
-        super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch)
-
-
-class WarmupConstantSchedule(LambdaLR):
-    """ Linear warmup and then constant.
-        Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps.
-        Keeps learning rate schedule equal to 1. after warmup_steps.
-    """
-    def __init__(self, optimizer, warmup_steps, last_epoch=-1):
-        self.warmup_steps = warmup_steps
-        super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
-
-    def lr_lambda(self, step):
-        if step < self.warmup_steps:
-            return float(step) / float(max(1.0, self.warmup_steps))
-        return 1.
-
-
-class WarmupLinearSchedule(LambdaLR):
-    """ Linear warmup and then linear decay.
-        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
-        Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps.
-    """
-    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
-        self.warmup_steps = warmup_steps
-        self.t_total = t_total
-        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
-
-    def lr_lambda(self, step):
-        if step < self.warmup_steps:
-            return float(step) / float(max(1, self.warmup_steps))
-        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
-
-
-class WarmupCosineSchedule(LambdaLR):
-    """ Linear warmup and then cosine decay.
-        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
-        Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
-        If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    """
-    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
-        self.warmup_steps = warmup_steps
-        self.t_total = t_total
-        self.cycles = cycles
-        super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
-
-    def lr_lambda(self, step):
-        if step < self.warmup_steps:
-            return float(step) / float(max(1.0, self.warmup_steps))
-        # progress after warmup
-        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
-        return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
-
-
-class WarmupCosineWithHardRestartsSchedule(LambdaLR):
-    """ Linear warmup and then cosine cycles with hard restarts.
-        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
-        If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
-        learning rate (with hard restarts).
-    """
-    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
-        self.warmup_steps = warmup_steps
-        self.t_total = t_total
-        self.cycles = cycles
-        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
-
-    def lr_lambda(self, step):
-        if step < self.warmup_steps:
-            return float(step) / float(max(1, self.warmup_steps))
-        # progress after warmup
-        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
-        if progress >= 1.0:
-            return 0.0
-        return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0))))
+def get_constant_schedule(optimizer, last_epoch=-1):
+    """ Create a schedule with a constant learning rate.
+    """
+    return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
+
+
+def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1):
+    """ Create a schedule with a constant learning rate preceded by a warmup
+    period during which the learning rate increases linearly between 0 and 1.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1.0, num_warmup_steps))
+        return 1.
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
+    """ Create a schedule with a learning rate that decreases linearly after
+    linearly increasing during a warmup period.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1):
+    """ Create a schedule with a learning rate that decreases following the
+    values of the cosine function between 0 and `pi * cycles` after a warmup
+    period during which it increases linearly between 0 and 1.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+        return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress)))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1):
+    """ Create a schedule with a learning rate that decreases following the
+    values of the cosine function with several hard restarts, after a warmup
+    period during which it increases linearly between 0 and 1.
+    """
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
+        if progress >= 1.:
+            return 0.
+        return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.))))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)


 class AdamW(Optimizer):
     """ Implements Adam algorithm with weight decay fix.
...
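A usage sketch for the refactored function-style schedules (the toy model and step counts are invented):

import torch
from transformers import AdamW, get_linear_schedule_with_warmup

model = torch.nn.Linear(10, 2)   # stand-in for a real transformer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000)

for step in range(1000):
    loss = model(torch.randn(4, 10)).pow(2).mean()   # dummy loss
    loss.backward()
    optimizer.step()
    scheduler.step()   # warm up for 100 steps, then decay linearly to 0
    optimizer.zero_grad()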
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
"""Applys a warmup schedule on a given learning rate decay schedule."""
def __init__(
self,
initial_learning_rate,
decay_schedule_fn,
warmup_steps,
power=1.0,
name=None):
super(WarmUp, self).__init__()
self.initial_learning_rate = initial_learning_rate
self.warmup_steps = warmup_steps
self.power = power
self.decay_schedule_fn = decay_schedule_fn
self.name = name
def __call__(self, step):
with tf.name_scope(self.name or 'WarmUp') as name:
# Implements polynomial warmup. i.e., if global_step < warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
global_step_float = tf.cast(step, tf.float32)
warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
warmup_percent_done = global_step_float / warmup_steps_float
warmup_learning_rate = (
self.initial_learning_rate *
tf.math.pow(warmup_percent_done, self.power))
return tf.cond(global_step_float < warmup_steps_float,
lambda: warmup_learning_rate,
lambda: self.decay_schedule_fn(step),
name=name)
def get_config(self):
return {
'initial_learning_rate': self.initial_learning_rate,
'decay_schedule_fn': self.decay_schedule_fn,
'warmup_steps': self.warmup_steps,
'power': self.power,
'name': self.name
}
def create_optimizer(init_lr, num_train_steps, num_warmup_steps):
"""Creates an optimizer with learning rate schedule."""
# Implements linear decay of the learning rate.
learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=init_lr,
decay_steps=num_train_steps,
end_learning_rate=0.0)
if num_warmup_steps:
learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
decay_schedule_fn=learning_rate_fn,
warmup_steps=num_warmup_steps)
optimizer = AdamWeightDecay(
learning_rate=learning_rate_fn,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=['layer_norm', 'bias'])
return optimizer
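A sketch of driving `create_optimizer` by hand. Note that this `AdamWeightDecay.apply_gradients` takes a required `clip_norm` argument, so the sketch uses an explicit gradient tape rather than `model.fit`; all data below is invented:

import tensorflow as tf

optimizer = create_optimizer(init_lr=3e-5, num_train_steps=1000, num_warmup_steps=100)

layer = tf.keras.layers.Dense(2)
x = tf.random.normal((8, 4))
y = tf.zeros((8,), dtype=tf.int32)
with tf.GradientTape() as tape:
    logits = layer(x)
    loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(y, logits, from_logits=True))
grads = tape.gradient(loss, layer.trainable_variables)
# this optimizer also clips by global norm inside apply_gradients
optimizer.apply_gradients(zip(grads, layer.trainable_variables), clip_norm=1.0)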
class AdamWeightDecay(tf.keras.optimizers.Adam):
"""Adam enables L2 weight decay and clip_by_global_norm on gradients.
Just adding the square of the weights to the loss function is *not* the
correct way of using L2 regularization/weight decay with Adam, since that will
interact with the m and v parameters in strange ways.
Instead we want to decay the weights in a manner that doesn't interact with
the m/v parameters. This is equivalent to adding the square of the weights to
the loss with plain (non-momentum) SGD.
"""
def __init__(self,
learning_rate=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-7,
amsgrad=False,
weight_decay_rate=0.0,
include_in_weight_decay=None,
exclude_from_weight_decay=None,
name='AdamWeightDecay',
**kwargs):
super(AdamWeightDecay, self).__init__(
learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
self.weight_decay_rate = weight_decay_rate
self._include_in_weight_decay = include_in_weight_decay
self._exclude_from_weight_decay = exclude_from_weight_decay
@classmethod
def from_config(cls, config):
"""Creates an optimizer from its config with WarmUp custom object."""
custom_objects = {'WarmUp': WarmUp}
return super(AdamWeightDecay, cls).from_config(
config, custom_objects=custom_objects)
def _prepare_local(self, var_device, var_dtype, apply_state):
super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype,
apply_state)
apply_state['weight_decay_rate'] = tf.constant(
self.weight_decay_rate, name='adam_weight_decay_rate')
def _decay_weights_op(self, var, learning_rate, apply_state):
do_decay = self._do_use_weight_decay(var.name)
if do_decay:
return var.assign_sub(
learning_rate * var *
apply_state['weight_decay_rate'],
use_locking=self._use_locking)
return tf.no_op()
def apply_gradients(self, grads_and_vars, clip_norm, name=None):
grads, tvars = list(zip(*grads_and_vars))
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm)
return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars))
def _get_lr(self, var_device, var_dtype, apply_state):
"""Retrieves the learning rate with the given state."""
if apply_state is None:
return self._decayed_lr_t[var_dtype], {}
apply_state = apply_state or {}
coefficients = apply_state.get((var_device, var_dtype))
if coefficients is None:
coefficients = self._fallback_apply_state(var_device, var_dtype)
apply_state[(var_device, var_dtype)] = coefficients
return coefficients['lr_t'], dict(apply_state=apply_state)
def _resource_apply_dense(self, grad, var, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_dense(
grad, var, **kwargs)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
decay = self._decay_weights_op(var, lr_t, apply_state)
with tf.control_dependencies([decay]):
return super(AdamWeightDecay, self)._resource_apply_sparse(
grad, var, indices, **kwargs)
def get_config(self):
config = super(AdamWeightDecay, self).get_config()
config.update({
'weight_decay_rate': self.weight_decay_rate,
})
return config
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if self.weight_decay_rate == 0:
return False
if self._include_in_weight_decay:
for r in self._include_in_weight_decay:
if re.search(r, param_name) is not None:
return True
if self._exclude_from_weight_decay:
for r in self._exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
## Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py
class GradientAccumulator(object):
"""Distribution strategies-aware gradient accumulation utility."""
def __init__(self):
"""Initializes the accumulator."""
self._gradients = []
self._accum_steps = tf.Variable(
initial_value=0,
dtype=tf.int64,
trainable=False,
aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
@property
def step(self):
"""Number of accumulated steps."""
return self._accum_steps.value()
@property
def gradients(self):
"""The accumulated gradients."""
return list(gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients())
def __call__(self, gradients):
"""Accumulates :obj:`gradients`."""
if not self._gradients:
self._gradients.extend([tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient for gradient in gradients])
if len(gradients) != len(self._gradients):
raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients)))
for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients):
if accum_gradient is not None:
accum_gradient.assign_add(gradient)
self._accum_steps.assign_add(1)
def reset(self):
"""Resets the accumulated gradients."""
if self._gradients:
self._accum_steps.assign(0)
for gradient in self._get_replica_gradients():
if gradient is not None:
gradient.assign(tf.zeros_like(gradient))
def _get_replica_gradients(self):
if tf.distribute.has_strategy():
# In a replica context, we want to accumulate gradients on each replica
# without synchronization, so we directly assign the value of the
# current replica.
replica_context = tf.distribute.get_replica_context()
if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1:
return self._gradients
return (gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients)
else:
return self._gradients
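A minimal single-replica sketch of the accumulator above (variable shapes and the number of micro-batches are invented):

import tensorflow as tf

accumulator = GradientAccumulator()
var = tf.Variable(tf.zeros((3,)))

for micro_step in range(4):   # pretend these are 4 micro-batches
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(var * float(micro_step + 1))
    grads = tape.gradient(loss, [var])
    accumulator(grads)        # buffers are created lazily on the first call

assert int(accumulator.step) == 4
mean_grads = [g / tf.cast(accumulator.step, g.dtype) for g in accumulator.gradients]
accumulator.reset()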
# content of conftest.py
import pytest
def pytest_addoption(parser):
parser.addoption(
"--runslow", action="store_true", default=False, help="run slow tests"
)
def pytest_collection_modifyitems(config, items):
if config.getoption("--runslow"):
# --runslow given in cli: do not skip slow tests
return
skip_slow = pytest.mark.skip(reason="need --runslow option to run")
for item in items:
if "slow" in item.keywords:
item.add_marker(skip_slow)
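With this conftest in place, a test opts into the slow bucket via the marker; for example (test body invented):

import pytest

@pytest.mark.slow
def test_download_all_pretrained_checkpoints():
    ...   # expensive; skipped by plain `pytest`, runs under `pytest --runslow`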
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import os
import six
import time
import unittest
from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
USER = "__DUMMY_TRANSFORMERS_USER__"
PASS = "__DUMMY_TRANSFORMERS_PASS__"
FILE_KEY = "Test-{}.txt".format(int(time.time()))
FILE_PATH = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
)
class HfApiCommonTest(unittest.TestCase):
_api = HfApi(endpoint="https://moon-staging.huggingface.co")
class HfApiLoginTest(HfApiCommonTest):
def test_login_invalid(self):
with self.assertRaises(HTTPError):
self._api.login(username=USER, password="fake")
def test_login_valid(self):
token = self._api.login(username=USER, password=PASS)
self.assertIsInstance(token, six.string_types)
class HfApiEndpointsTest(HfApiCommonTest):
@classmethod
def setUpClass(cls):
"""
Share this valid token in all tests below.
"""
cls._token = cls._api.login(username=USER, password=PASS)
def test_whoami(self):
user = self._api.whoami(token=self._token)
self.assertEqual(user, USER)
def test_presign(self):
urls = self._api.presign(token=self._token, filename=FILE_KEY)
self.assertIsInstance(urls, PresignedUrl)
self.assertEqual(urls.type, "text/plain")
def test_presign_and_upload(self):
access_url = self._api.presign_and_upload(
token=self._token, filename=FILE_KEY, filepath=FILE_PATH
)
self.assertIsInstance(access_url, six.string_types)
def test_list_objs(self):
objs = self._api.list_objs(token=self._token)
self.assertIsInstance(objs, list)
if len(objs) > 0:
o = objs[-1]
self.assertIsInstance(o, S3Obj)
class HfFolderTest(unittest.TestCase):
def test_token_workflow(self):
"""
Test the whole token save/get/delete workflow,
with the desired behavior with respect to non-existent tokens.
"""
token = "token-{}".format(int(time.time()))
HfFolder.save_token(token)
self.assertEqual(
HfFolder.get_token(),
token
)
HfFolder.delete_token()
HfFolder.delete_token()
# ^^ not an error, we test that the
# second call does not fail.
self.assertEqual(
HfFolder.get_token(),
None
)
if __name__ == "__main__":
unittest.main()
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import shutil
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
from .utils import require_torch, slow, torch_device
if is_torch_available():
from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM,
AlbertForSequenceClassification, AlbertForQuestionAnswering,
)
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
class AlbertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (AlbertModel, AlbertForMaskedLM) if is_torch_available() else ()
class AlbertModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_input_mask=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
embedding_size=16,
hidden_size=36,
num_hidden_layers=6,
num_hidden_groups=6,
num_attention_heads=6,
intermediate_size=37,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
num_choices=4,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_input_mask = use_input_mask
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.num_choices = num_choices
self.scope = scope
self.num_hidden_groups = num_hidden_groups
def prepare_config_and_inputs(self):
            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

            input_mask = None
            if self.use_input_mask:
                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

            token_type_ids = None
            if self.use_token_type_ids:
                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

            sequence_labels = None
            token_labels = None
            choice_labels = None
            if self.use_labels:
                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
                choice_labels = ids_tensor([self.batch_size], self.num_choices)

            config = AlbertConfig(
                vocab_size_or_config_json_file=self.vocab_size,
                hidden_size=self.hidden_size,
                num_hidden_layers=self.num_hidden_layers,
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                hidden_act=self.hidden_act,
                hidden_dropout_prob=self.hidden_dropout_prob,
                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                max_position_embeddings=self.max_position_embeddings,
                type_vocab_size=self.type_vocab_size,
                initializer_range=self.initializer_range,
                num_hidden_groups=self.num_hidden_groups)

            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels

        def check_loss_output(self, result):
            self.parent.assertListEqual(
                list(result["loss"].size()),
                [])

        def create_and_check_albert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = AlbertModel(config=config)
            model.to(torch_device)
            model.eval()
            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)
            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])

        def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = AlbertForMaskedLM(config=config)
            model.to(torch_device)
            model.eval()
            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
            result = {
                "loss": loss,
                "prediction_scores": prediction_scores,
            }
            self.parent.assertListEqual(
                list(result["prediction_scores"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.check_loss_output(result)

        def create_and_check_albert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            model = AlbertForQuestionAnswering(config=config)
            model.to(torch_device)
            model.eval()
            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                   start_positions=sequence_labels, end_positions=sequence_labels)
            result = {
                "loss": loss,
                "start_logits": start_logits,
                "end_logits": end_logits,
            }
            self.parent.assertListEqual(
                list(result["start_logits"].size()),
                [self.batch_size, self.seq_length])
            self.parent.assertListEqual(
                list(result["end_logits"].size()),
                [self.batch_size, self.seq_length])
            self.check_loss_output(result)

        def create_and_check_albert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
            config.num_labels = self.num_labels
            model = AlbertForSequenceClassification(config)
            model.to(torch_device)
            model.eval()
            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
            result = {
                "loss": loss,
                "logits": logits,
            }
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.num_labels])
            self.check_loss_output(result)

        def prepare_config_and_inputs_for_common(self):
            config_and_inputs = self.prepare_config_and_inputs()
            (config, input_ids, token_type_ids, input_mask,
             sequence_labels, token_labels, choice_labels) = config_and_inputs
            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
            return config, inputs_dict

    def setUp(self):
        self.model_tester = AlbertModelTest.AlbertModelTester(self)
        self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_albert_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_model(*config_and_inputs)

    def test_for_masked_lm(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_masked_lm(*config_and_inputs)

    def test_for_question_answering(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_question_answering(*config_and_inputs)

    def test_for_sequence_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_albert_for_sequence_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/transformers_test/"
        for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)


if __name__ == "__main__":
    unittest.main()
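Note: the @slow test above downloads a pretrained checkpoint and is skipped unless slow tests are enabled, so the rest of the suite can be run cheaply on its own. A sketch of driving just these tests from Python; the dotted module path is an assumption about where this file sits in the repository, not something this commit states:

# Run the fast Albert tests programmatically (module path is assumed).
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName(
    "transformers.tests.modeling_albert_test")
unittest.TextTestRunner(verbosity=2).run(suite)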
@@ -18,11 +18,12 @@ from __future__ import print_function
 import unittest
 import shutil
-import pytest
 import logging
 
 from transformers import is_torch_available
+from .utils import require_torch, slow
 
 if is_torch_available():
     from transformers import (AutoConfig, BertConfig,
                               AutoModel, BertModel,
@@ -33,12 +34,11 @@ if is_torch_available():
     from .modeling_common_test import (CommonTestCases, ids_tensor)
     from .configuration_common_test import ConfigTester
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class AutoModelTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -53,7 +53,7 @@ class AutoModelTest(unittest.TestCase):
         for value in loading_info.values():
             self.assertEqual(len(value), 0)
 
-    @pytest.mark.slow
+    @slow
     def test_lmhead_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -66,7 +66,7 @@ class AutoModelTest(unittest.TestCase):
         self.assertIsNotNone(model)
         self.assertIsInstance(model, BertForMaskedLM)
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_classification_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
@@ -79,7 +79,7 @@ class AutoModelTest(unittest.TestCase):
         self.assertIsNotNone(model)
         self.assertIsInstance(model, BertForSequenceClassification)
 
-    @pytest.mark.slow
+    @slow
     def test_question_answering_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
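Note: the `require_torch` and `slow` decorators (and `torch_device` used below) are imported from a new `tests/utils.py` that is not shown in this diff. A minimal sketch of what such helpers plausibly look like; the `RUN_SLOW` environment-variable name is an assumption, not confirmed by this commit:

# Sketch of the assumed tests/utils.py helpers (names from the imports above).
import os
import unittest

from transformers import is_torch_available


def require_torch(test_case):
    # Skips a test function or a whole TestCase when PyTorch is not installed.
    if not is_torch_available():
        return unittest.skip("test requires PyTorch")(test_case)
    return test_case


def slow(test_case):
    # Skips a test unless slow tests are explicitly opted into (assumed env var).
    if not os.environ.get("RUN_SLOW"):
        return unittest.skip("test is slow")(test_case)
    return test_case


torch_device = None
if is_torch_available():
    import torch
    # Run the suite on GPU when one is visible, otherwise fall back to CPU.
    torch_device = "cuda" if torch.cuda.is_available() else "cpu"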
@@ -18,12 +18,12 @@ from __future__ import print_function
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
 from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
@@ -31,10 +31,9 @@ if is_torch_available():
                               BertForQuestionAnswering, BertForSequenceClassification,
                               BertForTokenClassification, BertForMultipleChoice)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class BertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
@@ -141,6 +140,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
@@ -157,6 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertModel(config)
+            model.to(torch_device)
             model.eval()
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
@@ -173,6 +174,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
@@ -186,6 +188,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
             model = BertForMaskedLM(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
             loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
@@ -200,6 +203,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
+            model.to(torch_device)
             model.eval()
             loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
             result = {
@@ -213,6 +217,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
+            model.to(torch_device)
             model.eval()
             loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                                     masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
@@ -231,6 +236,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
+            model.to(torch_device)
             model.eval()
             loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
                                                    start_positions=sequence_labels, end_positions=sequence_labels)
@@ -250,6 +256,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
@@ -264,6 +271,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
+            model.to(torch_device)
             model.eval()
             loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
@@ -278,6 +286,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
             model = BertForMultipleChoice(config=config)
+            model.to(torch_device)
             model.eval()
             multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
@@ -349,7 +358,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_token_classification(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
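The change repeated through every `create_and_check_*` method above moves the model (and, via `torch_device`-aware tensor constructors, the inputs) onto a shared test device so the suite can run on GPU when one is available. A condensed sketch of that pattern; the function and argument names here are placeholders, not identifiers from the diff:

# Device-agnostic test pattern, condensed from the changes above.
import torch

torch_device = "cuda" if torch.cuda.is_available() else "cpu"


def run_on_test_device(model_class, config, input_ids):
    model = model_class(config)
    model.to(torch_device)                   # parameters follow the test device
    model.eval()                             # dropout off, deterministic outputs
    input_ids = input_ids.to(torch_device)   # inputs must live on the model's device
    with torch.no_grad():
        return model(input_ids)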
@@ -27,19 +27,18 @@ import uuid
 import unittest
 import logging
-import pytest
 
 from transformers import is_torch_available
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     import torch
     import numpy as np
 
-    from transformers import (PretrainedConfig, PreTrainedModel,
+    from transformers import (AdaptiveEmbedding, PretrainedConfig, PreTrainedModel,
                               BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
                               GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -65,6 +64,7 @@ def _config_zero_init(config):
 class CommonTestCases:
 
+    @require_torch
     class CommonModelTester(unittest.TestCase):
 
         model_tester = None
@@ -80,6 +80,7 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
@@ -89,18 +90,17 @@ class CommonTestCases:
             with TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model = model_class.from_pretrained(tmpdirname)
+                model.to(torch_device)
                 with torch.no_grad():
                     after_outputs = model(**inputs_dict)
 
-                # # Make sure we don't have nans
-                out_1 = after_outputs[0].numpy()
-                out_1[np.isnan(out_1)] = 0
-                out_1 = out_1 - out_2
-                amax = np.amax(out_1)
-                amin = np.amin(out_1)
-                self.assertLessEqual(max(amax, -amin), 1e-5)
+                # Make sure we don't have nans
+                out_1 = after_outputs[0].cpu().numpy()
+                out_2 = outputs[0].cpu().numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
 
         def test_initialization(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -118,6 +118,7 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 first, second = model(**inputs_dict)[0], model(**inputs_dict)[0]
                 self.assertEqual(first.ne(second).sum().item(), 0)
@@ -134,6 +135,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
@@ -164,6 +166,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
@@ -203,6 +206,7 @@ class CommonTestCases:
             configs_no_init.torchscript = True
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
                 inputs = inputs_dict['input_ids']  # Let's keep only input_ids
@@ -223,7 +227,10 @@ class CommonTestCases:
             except ValueError:
                 self.fail("Couldn't load module.")
 
+            model.to(torch_device)
             model.eval()
+
+            loaded_model.to(torch_device)
             loaded_model.eval()
 
             model_params = model.parameters()
@@ -249,11 +256,12 @@ class CommonTestCases:
             configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
             for model_class in self.all_model_classes:
                 model = model_class(config=configs_no_init)
+                model.to(torch_device)
                 model.eval()
 
                 # Prepare head_mask
                 # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
-                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
+                head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads, device=torch_device)
                 head_mask[0, 0] = 0
                 head_mask[-1, :-1] = 0
                 head_mask.requires_grad_(requires_grad=True)
@@ -302,6 +310,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                   -1: [0]}
@@ -330,6 +339,7 @@ class CommonTestCases:
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                   -1: [0]}
@@ -339,6 +349,7 @@ class CommonTestCases:
                 os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
                 outputs = model(**inputs_dict)
                 attentions = outputs[-1]
@@ -366,6 +377,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
 
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
 
                 outputs = model(**inputs_dict)
@@ -392,6 +404,7 @@ class CommonTestCases:
                 config.pruned_heads = heads_to_prune
                 model = model_class(config=config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
@@ -408,6 +421,7 @@ class CommonTestCases:
                 os.makedirs(directory)
                 model.save_pretrained(directory)
                 model = model_class.from_pretrained(directory)
+                model.to(torch_device)
                 shutil.rmtree(directory)
                 outputs = model(**inputs_dict)
@@ -438,6 +452,7 @@ class CommonTestCases:
                 config.output_hidden_states = True
                 config.output_attentions = False
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(**inputs_dict)
                 hidden_states = outputs[-1]
@@ -488,9 +503,15 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
-                model.get_input_embeddings()
+                self.assertIsInstance(
+                    model.get_input_embeddings(),
+                    (torch.nn.Embedding, AdaptiveEmbedding)
+                )
                 model.set_input_embeddings(torch.nn.Embedding(10, 10))
-                model.get_output_embeddings()
+                x = model.get_output_embeddings()
+                self.assertTrue(
+                    x is None or isinstance(x, torch.nn.Linear)
+                )
 
         def test_tie_model_weights(self):
             if not self.test_torchscript:
@@ -545,6 +566,20 @@ class CommonTestCases:
             # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape)
             # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head))
 
+        def test_inputs_embeds(self):
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            input_ids = inputs_dict["input_ids"]
+            del inputs_dict["input_ids"]
+
+            for model_class in self.all_model_classes:
+                model = model_class(config)
+                model.to(torch_device)
+                model.eval()
+
+                wte = model.get_input_embeddings()
+                inputs_dict["inputs_embeds"] = wte(input_ids)
+                outputs = model(**inputs_dict)
+
     class GPTModelTester(CommonModelTester):
@@ -629,6 +664,7 @@ class CommonTestCases:
         def create_and_check_base_model(self, config, input_ids, token_type_ids, position_ids,
                                         mc_labels, lm_labels, mc_token_ids):
             model = self.base_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, position_ids, token_type_ids)
@@ -644,6 +680,7 @@ class CommonTestCases:
         def create_and_check_lm_head(self, config, input_ids, token_type_ids, position_ids,
                                      mc_labels, lm_labels, mc_token_ids):
             model = self.lm_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
             loss, lm_logits = outputs[:2]
@@ -660,6 +697,7 @@ class CommonTestCases:
                                      mc_labels, lm_labels, mc_token_ids):
             for model_class in self.all_model_classes:
                 model = model_class(config)
+                model.to(torch_device)
                 model.eval()
                 outputs = model(input_ids)
                 presents = outputs[-1]
@@ -672,6 +710,7 @@ class CommonTestCases:
         def create_and_check_double_heads(self, config, input_ids, token_type_ids, position_ids,
                                           mc_labels, lm_labels, mc_token_ids):
             model = self.double_head_model_class(config)
+            model.to(torch_device)
             model.eval()
             outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                             token_type_ids=token_type_ids, position_ids=position_ids)
@@ -717,7 +756,7 @@ class CommonTestCases:
             config_and_inputs = self.prepare_config_and_inputs()
             self.create_and_check_presents(*config_and_inputs)
 
-        @pytest.mark.slow
+        @slow
         def run_slow_tests(self):
             self.create_and_check_model_from_pretrained()
@@ -771,7 +810,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     for _ in range(total_dims):
         values.append(rng.randint(0, vocab_size - 1))
 
-    return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
 
 
 def floats_tensor(shape, scale=1.0, rng=None, name=None):
@@ -787,11 +826,12 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None):
     for _ in range(total_dims):
        values.append(rng.random() * scale)
 
-    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
 
 
+@require_torch
 class ModelUtilsTest(unittest.TestCase):
-    @pytest.mark.slow
+    @slow
    def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
         for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
......
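The rewritten save/load check above compares the before- and after-reload outputs on CPU, drops NaNs instead of zeroing them, and bounds the maximum absolute difference. Distilled into a standalone helper (the function name is ours, not from the commit; the 1e-5 tolerance is the one in the diff):

# Standalone version of the numerical-equivalence check used in test_save_load.
import numpy as np
import torch


def assert_outputs_close(t1, t2, tol=1e-5):
    out_1 = t1.cpu().numpy()   # .cpu() first: CUDA tensors cannot be converted to numpy directly
    out_2 = t2.cpu().numpy()
    out_1 = out_1[~np.isnan(out_1)]   # drop NaNs rather than zeroing them
    out_2 = out_2[~np.isnan(out_2)]
    max_diff = np.amax(np.abs(out_1 - out_2))
    assert max_diff <= tol, "max diff %s exceeds %s" % (max_diff, tol)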