Commit d7395789 authored by danai-antoniou

Merge branch 'master' of https://github.com/danai-antoniou/pytorch-transformers into add-duplicate-tokens-error

parents 2e6797cc 391db836
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" A TF 2.0 Adaptive Softmax for Transformer XL model.
"""
from collections import defaultdict
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import shape_list
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
keep_order=False, **kwargs):
super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
self.n_token = n_token
self.d_embed = d_embed
self.d_proj = d_proj
self.cutoffs = cutoffs + [n_token]
self.cutoff_ends = [0] + self.cutoffs
self.div_val = div_val
self.shortlist_size = self.cutoffs[0]
self.n_clusters = len(self.cutoffs) - 1
self.head_size = self.shortlist_size + self.n_clusters
self.keep_order = keep_order
self.out_layers = []
self.out_projs = []
def build(self, input_shape):
if self.n_clusters > 0:
self.cluster_weight = self.add_weight(shape=(self.n_clusters, self.d_embed),
initializer='zeros',
trainable=True,
name='cluster_weight')
self.cluster_bias = self.add_weight(shape=(self.n_clusters,),
initializer='zeros',
trainable=True,
name='cluster_bias')
if self.div_val == 1:
for i in range(len(self.cutoffs)):
if self.d_proj != self.d_embed:
weight = self.add_weight(shape=(self.d_embed, self.d_proj),
initializer='zeros',
trainable=True,
name='out_projs_._{}'.format(i))
self.out_projs.append(weight)
else:
self.out_projs.append(None)
weight = self.add_weight(shape=(self.n_token, self.d_embed,),
initializer='zeros',
trainable=True,
name='out_layers_._{}_._weight'.format(i))
bias = self.add_weight(shape=(self.n_token,),
initializer='zeros',
trainable=True,
name='out_layers_._{}_._bias'.format(i))
self.out_layers.append((weight, bias))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
d_emb_i = self.d_embed // (self.div_val ** i)
weight = self.add_weight(shape=(d_emb_i, self.d_proj),
initializer='zeros',
trainable=True,
name='out_projs_._{}'.format(i))
self.out_projs.append(weight)
weight = self.add_weight(shape=(r_idx-l_idx, d_emb_i,),
initializer='zeros',
trainable=True,
name='out_layers_._{}_._weight'.format(i))
bias = self.add_weight(shape=(r_idx-l_idx,),
initializer='zeros',
trainable=True,
name='out_layers_._{}_._bias'.format(i))
self.out_layers.append((weight, bias))
super(TFAdaptiveSoftmaxMask, self).build(input_shape)
@staticmethod
def _logit(x, W, b, proj=None):
y = x
if proj is not None:
y = tf.einsum('ibd,ed->ibe', y, proj)
return tf.einsum('ibd,nd->ibn', y, W) + b
@staticmethod
def _gather_logprob(logprob, target):
lp_size = tf.shape(logprob)
r = tf.range(lp_size[0])
idx = tf.stack([r, target], 1)
return tf.gather_nd(logprob, idx)
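# Illustration (not part of the model; the values are made up) of what _gather_logprob
# computes: for a [num_examples, vocab] matrix of log-probabilities and one integer
# target per row, it returns logprob[i, target[i]] for every i via tf.gather_nd,
# i.e. the log-likelihood assigned to each example's label.
#
#     logprob = tf.math.log(tf.constant([[0.7, 0.2, 0.1],
#                                         [0.1, 0.1, 0.8]]))
#     target = tf.constant([0, 2])
#     TFAdaptiveSoftmaxMask._gather_logprob(logprob, target)  # -> [log(0.7), log(0.8)]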
def call(self, inputs, return_mean=True, training=False):
hidden, target = inputs
head_logprob = 0
if self.n_clusters == 0:
output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
if target is not None:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
out = tf.nn.log_softmax(output, axis=-1)
else:
hidden_sizes = shape_list(hidden)
out = []
loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32)
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
if target is not None:
mask = (target >= l_idx) & (target < r_idx)
mask_idx = tf.where(mask)
cur_target = tf.boolean_mask(target, mask) - l_idx
if self.div_val == 1:
cur_W = self.out_layers[0][0][l_idx:r_idx]
cur_b = self.out_layers[0][1][l_idx:r_idx]
else:
cur_W = self.out_layers[i][0]
cur_b = self.out_layers[i][1]
if i == 0:
cur_W = tf.concat([cur_W, self.cluster_weight], 0)
cur_b = tf.concat([cur_b, self.cluster_bias], 0)
head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0])
head_logprob = tf.nn.log_softmax(head_logit)
out.append(head_logprob[..., :self.cutoffs[0]])
if target is not None:
cur_head_logprob = tf.boolean_mask(head_logprob, mask)
cur_logprob = self._gather_logprob(cur_head_logprob, cur_target)
else:
tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i])
tail_logprob = tf.nn.log_softmax(tail_logit)
cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster
logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob
out.append(logprob_i)
if target is not None:
cur_head_logprob = tf.boolean_mask(head_logprob, mask)
cur_tail_logprob = tf.boolean_mask(tail_logprob, mask)
cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
if target is not None:
loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(tf.shape(loss), dtype=tf.int64))
out = tf.concat(out, axis=-1)
if target is not None:
if return_mean:
loss = tf.reduce_mean(loss)
# Add the training-time loss value to the layer using `self.add_loss()`.
self.add_loss(loss)
# Log the loss as a metric (we could log arbitrary metrics,
# including different metrics for training and inference).
self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '')
return out
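# Hedged usage sketch for TFAdaptiveSoftmaxMask; the hyperparameters and shapes below
# are illustrative only and do not correspond to any released checkpoint. The layer
# expects (hidden, target) with hidden shaped [qlen, bsz, d_proj] (the 'ibd' einsum
# layout used in _logit) and integer targets of shape [qlen, bsz]; it returns
# log-probabilities over the full vocabulary and registers the negative log-likelihood
# via self.add_loss().
#
#     adaptive_softmax = TFAdaptiveSoftmaxMask(n_token=1000, d_embed=64, d_proj=64,
#                                              cutoffs=[200, 500], div_val=2)
#     hidden = tf.random.normal((5, 3, 64))                         # [qlen, bsz, d_proj]
#     target = tf.random.uniform((5, 3), maxval=1000, dtype=tf.int32)
#     logprobs = adaptive_softmax([hidden, target], return_mean=True)
#     # logprobs: [5, 3, 1000]; adaptive_softmax.losses holds the scalar mean NLL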
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF general model utils."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import os
import tensorflow as tf
from .configuration_utils import PretrainedConfig
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
logger = logging.getLogger(__name__)
class TFPreTrainedModel(tf.keras.Model):
r""" Base class for all TF models.
:class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods common to all models for (i) resizing the input embeddings and (ii) pruning heads in the self-attention layers.
Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
- ``load_pt_weights``: a python ``method`` for loading a PyTorch checkpoint in a TF 2.0 model, taking as arguments:
- ``model``: an instance of the relevant subclass of :class:`~transformers.TFPreTrainedModel`,
- ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
- ``path``: a path (string) to the PyTorch checkpoint.
- ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
"""
config_class = None
pretrained_model_archive_map = {}
load_pt_weights = lambda model, config, path: None
base_model_prefix = ""
def __init__(self, config, *inputs, **kwargs):
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
if not isinstance(config, PretrainedConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
"To create a model from a pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
))
# Save config in model
self.config = config
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
""" Build a resized Embedding Variable from a provided token Embedding Module.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
Args:
new_num_tokens: (`optional`) int
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: return the provided token Embedding Module.
Return: ``tf.Variable``
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
"""
# if new_num_tokens is None:
# return old_embeddings
# old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
# if old_num_tokens == new_num_tokens:
# return old_embeddings
# # Build new embeddings
# new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
# new_embeddings.to(old_embeddings.weight.device)
# # initialize all new embeddings (in particular added tokens)
# self._init_weights(new_embeddings)
# # Copy word embeddings from the previous weights
# num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
# new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
# return new_embeddings
def resize_token_embeddings(self, new_num_tokens=None):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Arguments:
new_num_tokens: (`optional`) int:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model.
Return: ``tf.Variable``
Pointer to the input tokens Embeddings Module of the model
"""
raise NotImplementedError
def prune_heads(self, heads_to_prune):
""" Prunes heads of the base model.
Arguments:
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
"""
raise NotImplementedError
def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it
can be re-loaded using the :func:`~transformers.TFPreTrainedModel.from_pretrained` class method.
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
# Save configuration file
self.config.save_pretrained(save_directory)
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME)
self.save_weights(output_model_file)
logger.info("Model weights saved in {}".format(output_model_file))
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.
Unlike the PyTorch models there is no separate evaluation/training mode to set: dropout and similar layers are controlled
by the ``training`` argument passed at call time, which defaults to ``False`` in these models.
The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
It is up to you to train those weights with a downstream fine-tuning task.
The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.
Parameters:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint to a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
from_pt: (`optional`) boolean, default False:
Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument).
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force the (re-)download of the model weights and configuration files, overriding any cached versions.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
model = TFBertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
model = TFBertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = TFBertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
assert model.config.output_attention == True
# Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
model = TFBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config)
"""
config = kwargs.pop('config', None)
cache_dir = kwargs.pop('cache_dir', None)
from_pt = kwargs.pop('from_pt', False)
force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
# Load config
if config is None:
config, model_kwargs = cls.config_class.from_pretrained(
pretrained_model_name_or_path, *model_args,
cache_dir=cache_dir, return_unused_kwargs=True,
force_download=force_download,
**kwargs
)
else:
model_kwargs = kwargs
# Load model
if pretrained_model_name_or_path is not None:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
# Load from a TF 2.0 checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
# Load from a PyTorch checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
else:
raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
[WEIGHTS_NAME, TF2_WEIGHTS_NAME],
pretrained_model_name_or_path))
elif os.path.isfile(pretrained_model_name_or_path):
archive_file = pretrained_model_name_or_path
else:
raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path))
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(cls.pretrained_model_archive_map.keys()),
archive_file))
raise e
if resolved_archive_file == archive_file:
logger.info("loading weights file {}".format(archive_file))
else:
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
else:
resolved_archive_file = None
# Instantiate model.
model = cls(config, *model_args, **model_kwargs)
if from_pt:
# Load from a PyTorch checkpoint
return cls.load_pt_weights(model, resolved_archive_file)
inputs = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
ret = model(inputs, training=False) # build the network with dummy inputs
assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
# 'by_name' allows us to do transfer learning by skipping/adding layers
# see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357
model.load_weights(resolved_archive_file, by_name=True)
ret = model(inputs, training=False) # Make sure restore ops are run
return model
class TFConv1D(tf.keras.layers.Layer):
def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
""" TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
Basically works like a Linear layer but the weights are transposed
"""
super(TFConv1D, self).__init__(**kwargs)
self.nf = nf
self.nx = nx
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
"weight",
shape=[self.nx, self.nf],
initializer=get_initializer(self.initializer_range))
self.bias = self.add_weight(
"bias",
shape=[1, self.nf],
initializer=tf.zeros_initializer())
def call(self, x):
bz, sl = shape_list(x)[:2]
x = tf.reshape(x, [-1, self.nx])
x = tf.matmul(x, self.weight) + self.bias
x = tf.reshape(x, [bz, sl, self.nf])
return x
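# Hedged sketch of TFConv1D (shapes are illustrative): it acts as a position-wise
# linear layer over the last axis; the kernel is stored as [nx, nf] (transposed
# relative to ``torch.nn.Linear``) and call() computes x @ weight + bias.
#
#     conv = TFConv1D(nf=8, nx=4)
#     x = tf.random.normal((2, 5, 4))      # [batch, seq_len, nx]
#     y = conv(x)                          # [batch, seq_len, nf] -> shape (2, 5, 8)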
class TFSharedEmbeddings(tf.keras.layers.Layer):
"""Construct shared token embeddings.
"""
def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
super(TFSharedEmbeddings, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
def build(self, input_shape):
"""Build shared word embedding layer
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
self.weight = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range))
super(TFSharedEmbeddings, self).build(input_shape)
def call(self, inputs, mode="embedding"):
"""Get token embeddings of inputs.
Args:
inputs: an integer tensor of shape [batch_size, length] containing token ids when mode == "embedding", or a float tensor of shape [..., hidden_size] when mode == "linear".
mode: string, a valid value is one of "embedding" and "linear".
Returns:
outputs: (1) If mode == "embedding", output embedding tensor, float32 with
shape [batch_size, length, embedding_size]; (2) mode == "linear", output
linear tensor, float32 with shape [batch_size, length, vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(inputs)
elif mode == "linear":
return self._linear(inputs)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids):
"""Applies embedding based on inputs tensor."""
return tf.gather(self.weight, input_ids)
def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [..., hidden_size]
Returns:
float32 tensor with shape [..., vocab_size].
"""
first_dims = shape_list(inputs)[:-1]
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.weight, transpose_b=True)
return tf.reshape(logits, first_dims + [self.vocab_size])
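# Hedged sketch of the two modes of TFSharedEmbeddings (sizes are illustrative):
# "embedding" gathers rows of the shared weight matrix, "linear" multiplies by its
# transpose, which is how output projections get tied to the input embeddings.
#
#     emb = TFSharedEmbeddings(vocab_size=100, hidden_size=16)
#     ids = tf.constant([[3, 7, 51]])                  # [batch=1, length=3]
#     hidden = emb(ids, mode="embedding")              # [1, 3, 16]
#     logits = emb(hidden, mode="linear")              # [1, 3, 100]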
class TFSequenceSummary(tf.keras.layers.Layer):
r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
Args of the config class:
summary_type:
- 'last' => [default] take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj: Add a projection after the vector extraction
summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_activation: 'tanh' => add a tanh activation to the output, other => no activation. Default: no activation.
summary_first_dropout: Add a dropout before the projection and activation
summary_last_dropout: Add a dropout after the projection and activation
"""
def __init__(self, config, initializer_range=0.02, **kwargs):
super(TFSequenceSummary, self).__init__(**kwargs)
self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
if self.summary_type == 'attn':
# We should use a standard multi-head attention module with absolute positional embedding for that.
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
# We can probably just use the multi-head attention module of PyTorch >=1.1.0
raise NotImplementedError
self.summary = None
if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
num_classes = config.num_labels
else:
num_classes = config.hidden_size
self.summary = tf.keras.layers.Dense(num_classes,
kernel_initializer=get_initializer(initializer_range),
name='summary')
self.activation = None
if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
self.activation = tf.keras.activations.tanh
self.first_dropout = None
if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)
self.last_dropout = None
if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
def call(self, inputs, training=False):
""" hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
cls_index: [optional] position of the classification token if summary_type == 'cls_index',
shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
if summary_type == 'cls_index' and cls_index is None:
we take the last token of the sequence as classification token
"""
if not isinstance(inputs, (dict, tuple, list)):
hidden_states = inputs
cls_index = None
elif isinstance(inputs, (tuple, list)):
hidden_states = inputs[0]
cls_index = inputs[1] if len(inputs) > 1 else None
assert len(inputs) <= 2, "Too many inputs."
else:
hidden_states = inputs.get('hidden_states')
cls_index = inputs.get('cls_index', None)
if self.summary_type == 'last':
output = hidden_states[:, -1]
elif self.summary_type == 'first':
output = hidden_states[:, 0]
elif self.summary_type == 'mean':
output = tf.reduce_mean(hidden_states, axis=1)
elif self.summary_type == 'cls_index':
hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims]
if cls_index is None:
cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1) # A tensor of shape [batch] or [batch, num choices] filled with the last position index (sequence length - 1)
cls_shape = shape_list(cls_index)
if len(cls_shape) <= len(hidden_shape) - 2:
cls_index = cls_index[..., tf.newaxis]
# else:
# cls_index = cls_index[..., tf.newaxis]
# cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
# shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2)
output = tf.squeeze(output, axis=len(hidden_shape) - 2) # shape of output: (batch, num choices, hidden_size)
elif self.summary_type == 'attn':
raise NotImplementedError
if training and self.first_dropout is not None:
output = self.first_dropout(output)
if self.summary is not None:
output = self.summary(output)
if self.activation is not None:
output = self.activation(output)
if training and self.last_dropout is not None:
output = self.last_dropout(output)
return output
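# Hedged usage sketch for TFSequenceSummary. The config below is a made-up stand-in
# (types.SimpleNamespace) exposing only the attributes this layer reads; real models
# pass their PretrainedConfig subclass (e.g. XLMConfig) instead.
#
#     from types import SimpleNamespace
#     cfg = SimpleNamespace(summary_type='last', summary_use_proj=True,
#                           summary_proj_to_labels=False, num_labels=0,
#                           hidden_size=16, summary_activation='tanh',
#                           summary_first_dropout=0.0, summary_last_dropout=0.1)
#     summary = TFSequenceSummary(cfg)
#     hidden_states = tf.random.normal((2, 7, 16))     # [bsz, seq_len, hidden_size]
#     pooled = summary(hidden_states)                  # [bsz, hidden_size] -> (2, 16)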
def shape_list(x):
"""Deal with dynamic shape in tensorflow cleanly."""
static = x.shape.as_list()
dynamic = tf.shape(x)
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
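# Hedged illustration of shape_list: each axis keeps its static size when known and
# falls back to the dynamic tf.shape() entry when it is None, so the result can be
# used for reshapes inside traced (tf.function / Keras) code.
#
#     @tf.function(input_signature=[tf.TensorSpec([None, 5], tf.float32)])
#     def flatten(x):
#         bsz = shape_list(x)[0]           # dynamic scalar tensor (batch dim is None)
#         return tf.reshape(x, [bsz * 5])  # the static dim 5 comes back as a plain int
#     # flatten(tf.zeros((3, 5))).shape -> (15,)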
def get_initializer(initializer_range=0.02):
"""Creates a `tf.initializers.truncated_normal` with the given range.
Args:
initializer_range: float, initializer range for stddev.
Returns:
TruncatedNormal initializer with stddev = `initializer_range`.
"""
return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
# coding=utf-8
# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 XLM model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math
import os
import itertools
import numpy as np
import tensorflow as tf
from .configuration_xlm import XLMConfig
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
logger = logging.getLogger(__name__)
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5",
'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5",
'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5",
'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5",
'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5",
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5",
}
def load_xlm_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
# build the network
inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
if tf_model.config.use_lang_emb and tf_model.config.n_langs > 1:
langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
else:
langs_list = None
tf_inputs = [inputs_list, attns_list, langs_list]
tfo = tf_model(tf_inputs, training=False)
return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
def create_sinusoidal_embeddings(n_pos, dim, out):
position_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
for pos in range(n_pos)
])
out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2]))
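# Note: the slice assignments above require `out` to support item assignment (e.g. a
# NumPy array); they would fail on a plain tf.Tensor. Hedged sketch of building such a
# table as a constant (sizes are illustrative; the sinusoidal path is not wired up,
# see the NotImplementedError for config.sinusoidal_embeddings further below):
#
#     table = np.zeros((512, 64), dtype=np.float32)    # [n_pos, dim]
#     create_sinusoidal_embeddings(512, 64, out=table)
#     sinusoid = tf.constant(table)                    # fixed positional table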
def gelu(x):
""" Gaussian Error Linear Unit.
Original implementation of the gelu activation function in the Google BERT repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
return x * cdf
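# Hedged numeric check of the note above: the exact (erf-based) gelu implemented here
# and the tanh approximation used by OpenAI GPT agree closely on a typical activation range.
#
#     x = tf.linspace(-3.0, 3.0, 7)
#     approx = 0.5 * x * (1.0 + tf.tanh(tf.sqrt(2.0 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))
#     tf.reduce_max(tf.abs(gelu(x) - approx))          # small, below ~1e-3 on this grid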
def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32):
"""
Generate hidden states mask, and optionally an attention mask.
"""
bs = shape_list(lengths)[0]
if padding_mask is not None:
mask = padding_mask
else:
# assert lengths.max().item() <= slen
alen = tf.range(slen)
mask = tf.math.less(alen, lengths[:, tf.newaxis])
# the attention mask is either the same as the padding mask, or a lower-triangular (causal) mask
if causal:
attn_mask = tf.less_equal(tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)),
alen[tf.newaxis, :, tf.newaxis])
else:
attn_mask = mask
# sanity check
assert shape_list(mask) == [bs, slen]
assert causal is False or shape_list(attn_mask) == [bs, slen, slen]
mask = tf.cast(mask, dtype=dtype)
attn_mask = tf.cast(attn_mask, dtype=dtype)
return mask, attn_mask
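# Hedged worked example of get_masks (values are illustrative): for two sequences of
# lengths 3 and 5 padded to slen=5, `mask` flags the real (non-padding) positions and,
# with causal=True, `attn_mask` additionally restricts every position to attend only
# to itself and earlier positions.
#
#     lengths = tf.constant([3, 5])
#     mask, attn_mask = get_masks(5, lengths, causal=False)
#     # mask.shape == attn_mask.shape == (2, 5); mask[0] == [1., 1., 1., 0., 0.]
#     _, causal_attn = get_masks(5, lengths, causal=True)
#     # causal_attn.shape == (2, 5, 5): a lower-triangular pattern for each sequence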
class TFMultiHeadAttention(tf.keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config, **kwargs):
super(TFMultiHeadAttention, self).__init__(**kwargs)
self.layer_id = next(TFMultiHeadAttention.NEW_ID)
self.output_attentions = config.output_attentions
self.dim = dim
self.n_heads = n_heads
assert self.dim % self.n_heads == 0
self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin')
self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin')
self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin')
self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin')
self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
self.pruned_heads = set()
def prune_heads(self, heads):
raise NotImplementedError
def call(self, inputs, training=False):
"""
Self-attention (if kv is None) or attention over source sentence (provided by kv).
"""
input, mask, kv, cache, head_mask = inputs
# Input is (bs, qlen, dim)
# Mask is (bs, klen) (non-causal) or (bs, klen, klen)
bs, qlen, dim = shape_list(input)
if kv is None:
klen = qlen if cache is None else cache['slen'] + qlen
else:
klen = shape_list(kv)[1]
# assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
n_heads = self.n_heads
dim_per_head = self.dim // n_heads
mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
def shape(x):
""" projection """
return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
def unshape(x):
""" compute context """
return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head)
if kv is None:
k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head)
v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head)
elif cache is None or self.layer_id not in cache:
k = v = kv
k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head)
v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head)
if cache is not None:
if self.layer_id in cache:
if kv is None:
k_, v_ = cache[self.layer_id]
k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head)
v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head)
else:
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head)
scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen)
mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen)
# scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen)
scores = scores - 1e30 * (1.0 - mask)
weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen)
weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen)
# Mask heads if we want to
if head_mask is not None:
weights = weights * head_mask
context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head)
context = unshape(context) # (bs, qlen, dim)
outputs = (self.out_lin(context),)
if self.output_attentions:
outputs = outputs + (weights,)
return outputs
class TFTransformerFFN(tf.keras.layers.Layer):
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
super(TFTransformerFFN, self).__init__(**kwargs)
self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1')
self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2')
self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
self.dropout = tf.keras.layers.Dropout(config.dropout)
def call(self, input, training=False):
x = self.lin1(input)
x = self.act(x)
x = self.lin2(x)
x = self.dropout(x, training=training)
return x
class TFXLMMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLMMainLayer, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
# encoder / decoder, output layer
self.is_encoder = config.is_encoder
self.is_decoder = not config.is_encoder
if self.is_decoder:
raise NotImplementedError("Currently XLM can only be used as an encoder")
# self.with_output = with_output
self.causal = config.causal
# dictionary / languages
self.n_langs = config.n_langs
self.use_lang_emb = config.use_lang_emb
self.n_words = config.n_words
self.eos_index = config.eos_index
self.pad_index = config.pad_index
# self.dico = dico
# self.id2lang = config.id2lang
# self.lang2id = config.lang2id
# assert len(self.dico) == self.n_words
# assert len(self.id2lang) == len(self.lang2id) == self.n_langs
# model parameters
self.dim = config.emb_dim # 512 by default
self.hidden_dim = self.dim * 4 # 2048 by default
self.n_heads = config.n_heads # 8 by default
self.n_layers = config.n_layers
assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
# embeddings
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout)
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
self.dim,
embeddings_initializer=get_initializer(config.embed_init_std),
name='position_embeddings')
if config.sinusoidal_embeddings:
raise NotImplementedError
# create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
if config.n_langs > 1 and config.use_lang_emb:
self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs,
self.dim,
embeddings_initializer=get_initializer(config.embed_init_std),
name='lang_embeddings')
self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings') # padding_idx=self.pad_index)
self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb')
# transformer layers
self.attentions = []
self.layer_norm1 = []
self.ffns = []
self.layer_norm2 = []
# if self.is_decoder:
# self.layer_norm15 = []
# self.encoder_attn = []
for i in range(self.n_layers):
self.attentions.append(TFMultiHeadAttention(self.n_heads, self.dim, config=config, name='attentions_._{}'.format(i)))
self.layer_norm1.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm1_._{}'.format(i)))
# if self.is_decoder:
# self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
# self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
self.ffns.append(TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name='ffns_._{}'.format(i)))
self.layer_norm2.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm2_._{}'.format(i)))
if hasattr(config, "pruned_heads"):
pruned_heads = config.pruned_heads.copy().items()
config.pruned_heads = {}
for layer, heads in pruned_heads:
if self.attentions[int(layer)].n_heads == config.n_heads:
self.prune_heads({int(layer): list(map(int, heads))})
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None,
position_ids=None, lengths=None, cache=None, head_mask=None,
training=False): # removed: src_enc=None, src_len=None
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
langs = inputs[2] if len(inputs) > 2 else langs
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
position_ids = inputs[4] if len(inputs) > 4 else position_ids
lengths = inputs[5] if len(inputs) > 5 else lengths
cache = inputs[6] if len(inputs) > 6 else cache
head_mask = inputs[7] if len(inputs) > 7 else head_mask
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
langs = inputs.get('langs', langs)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
lengths = inputs.get('lengths', lengths)
cache = inputs.get('cache', cache)
head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 8, "Too many inputs."
else:
input_ids = inputs
if lengths is None:
lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
# mask = input_ids != self.pad_index
# check inputs
bs, slen = shape_list(input_ids)
assert shape_list(lengths)[0] == bs
# assert lengths.max().item() <= slen
# input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
# assert (src_enc is None) == (src_len is None)
# if src_enc is not None:
# assert self.is_decoder
# assert src_enc.size(0) == bs
# generate masks
mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
# if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
# position_ids
if position_ids is None:
position_ids = tf.expand_dims(tf.range(slen), axis=0)
else:
assert shape_list(position_ids) == [bs, slen] # (slen, bs)
# position_ids = position_ids.transpose(0, 1)
# langs
if langs is not None:
assert shape_list(langs) == [bs, slen] # (slen, bs)
# langs = langs.transpose(0, 1)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.n_layers
# do not recompute cached elements
if cache is not None:
_slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:]
position_ids = position_ids[:, -_slen:]
if langs is not None:
langs = langs[:, -_slen:]
mask = mask[:, -_slen:]
attn_mask = attn_mask[:, -_slen:]
# embeddings
tensor = self.embeddings(input_ids)
tensor = tensor + self.position_embeddings(position_ids)
if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None:
tensor = tensor + self.embeddings(token_type_ids)
tensor = self.layer_norm_emb(tensor)
tensor = self.dropout(tensor, training=training)
tensor = tensor * mask[..., tf.newaxis]
# transformer layers
hidden_states = ()
attentions = ()
for i in range(self.n_layers):
if self.output_hidden_states:
hidden_states = hidden_states + (tensor,)
# self attention
attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
attn = attn_outputs[0]
if self.output_attentions:
attentions = attentions + (attn_outputs[1],)
attn = self.dropout(attn, training=training)
tensor = tensor + attn
tensor = self.layer_norm1[i](tensor)
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn = F.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
# FFN
tensor = tensor + self.ffns[i](tensor)
tensor = self.layer_norm2[i](tensor)
tensor = tensor * mask[..., tf.newaxis]
# Add last hidden state
if self.output_hidden_states:
hidden_states = hidden_states + (tensor,)
# update cache length
if cache is not None:
cache['slen'] += shape_list(tensor)[1]
# move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1)
outputs = (tensor,)
if self.output_hidden_states:
outputs = outputs + (hidden_states,)
if self.output_attentions:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
class TFXLMPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLMConfig
pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
load_pt_weights = load_xlm_pt_weights_in_tf2
base_model_prefix = "transformer"
XLM_START_DOCSTRING = r""" The XLM model was proposed in
`Cross-lingual Language Model Pretraining`_
by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
- a causal language modeling (CLM) objective (next token prediction),
- a masked language modeling (MLM) objective (Bert-like), or
- a Translation Language Modeling (TLM) objective (an extension of BERT's MLM to multiple language inputs)
Original code can be found `here`_.
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matters related to general usage and behavior.
.. _`Cross-lingual Language Model Pretraining`:
https://arxiv.org/abs/1901.07291
.. _`here`:
https://github.com/facebookresearch/XLM
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument, as sketched below:
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
XLM_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`transformers.XLMTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**langs**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens to be used to indicate the language of each token in the input.
Indices are languages ids which can be obtained from the language names by using two conversion mappings
provided in the configuration of the model (only provided for multilingual models).
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**lengths**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size,)``:
Length of each sentence that can be used to avoid performing attention on padding token indices.
You can also use `attention_mask` for the same result (see above), kept here for compatibility.
Indices selected in ``[0, ..., sequence_length]``:
**cache**:
dictionary with ``Numpy array`` or ``tf.Tensor`` that contains pre-computed
hidden-states (key and values in the attention blocks) as computed by the model
(see `cache` output below). Can be used to speed up sequential decoding.
The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class TFXLMModel(TFXLMPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLMModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
class TFXLMPredLayer(tf.keras.layers.Layer):
"""
Prediction layer (cross_entropy or adaptive_softmax).
"""
def __init__(self, config, input_embeddings, **kwargs):
super(TFXLMPredLayer, self).__init__(**kwargs)
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
if config.asm is False:
self.input_embeddings = input_embeddings
else:
raise NotImplementedError
# self.proj = nn.AdaptiveLogSoftmaxWithLoss(
# in_features=dim,
# n_classes=config.n_words,
# cutoffs=config.asm_cutoffs,
# div_value=config.asm_div_value,
# head_bias=True, # default is False
# )
def build(self, input_shape):
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
self.bias = self.add_weight(shape=(self.n_words,),
initializer='zeros',
trainable=True,
name='bias')
super(TFXLMPredLayer, self).build(input_shape)
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
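# Hedged note on the tying above (shapes illustrative): because the prediction layer
# calls the shared embeddings in "linear" mode, its logits are hidden_states @ W_emb^T
# plus the per-token bias created in build(); the only new trainable parameters are
# those n_words bias terms.
#
#     # pred_layer = TFXLMPredLayer(config, transformer.embeddings)  # as in TFXLMWithLMHeadModel
#     # logits = pred_layer(hidden_states)  # == embeddings(hidden_states, mode="linear") + pred_layer.bias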
@add_start_docstrings("""The XLM Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMWithLMHeadModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores = outputs[0] # The language modeling logits are the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name='transformer')
self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
output = transformer_outputs[0]
outputs = self.pred_layer(output)
outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs
@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForSequenceClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.constant([1])[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLMMainLayer(config, name='transformer')
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs
@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
start_scores, end_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name='transformer')
self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.init_std),
name='qa_outputs')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
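# qa_outputs maps each token to two values; splitting on the last axis and squeezing
# yields start and end logits of shape [batch_size, seq_len] each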
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states and attentions if they are in it
return outputs # start_logits, end_logits, (hidden_states), (attentions)
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 XLNet model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .configuration_xlnet import XLNetConfig
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
logger = logging.getLogger(__name__)
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5",
'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5",
}
def load_xlnet_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False) # build the network
return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
def gelu(x):
""" Implementation of the gelu activation function.
XLNet is using OpenAI GPT's gelu
Also see https://arxiv.org/abs/1606.08415
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
def swish(x):
return x * tf.sigmoid(x)
ACT2FN = {"gelu": tf.keras.layers.Activation(gelu),
"relu": tf.keras.activations.relu,
"swish": tf.keras.layers.Activation(swish)}
class TFXLNetRelativeAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLNetRelativeAttention, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
if config.d_model % config.n_head != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.d_model, config.n_head))
self.n_head = config.n_head
self.d_head = config.d_head
self.d_model = config.d_model
self.scale = 1 / (config.d_head ** 0.5)
self.initializer_range = config.initializer_range
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
self.dropout = tf.keras.layers.Dropout(config.dropout)
def build(self, input_shape):
initializer = get_initializer(self.initializer_range)
self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='q')
self.k = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='k')
self.v = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='v')
self.o = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='o')
self.r = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='r')
self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True, name='r_r_bias')
self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True, name='r_s_bias')
self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True, name='r_w_bias')
self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='seg_embed')
super(TFXLNetRelativeAttention, self).build(input_shape)
def prune_heads(self, heads):
raise NotImplementedError
@staticmethod
def rel_shift(x, klen=-1):
"""perform relative shift to form the relative attention score."""
x_size = shape_list(x)
x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3]))
x = x[1:, ...]
x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3]))
x = x[:, 0:klen, :, :]
# x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
return x
def rel_attn_core(self, inputs, training=False):
"""Core relative positional attention operations."""
q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs
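# shapes: q_head [qlen, bsz, n_head, d_head], k_head_h / v_head_h [klen, bsz, n_head, d_head],
# k_head_r [rlen, bsz, n_head, d_head]; the einsums below yield scores of shape [qlen, klen, bsz, n_head]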
# content based attention score
ac = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_w_bias, k_head_h)
# position based attention score
bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
bd = self.rel_shift(bd, klen=ac.shape[1])
# segment based attention score
if seg_mat is None:
ef = 0
else:
ef = tf.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed)
ef = tf.einsum('ijbs,ibns->ijbn', seg_mat, ef)
# merge attention scores and perform masking
attn_score = (ac + bd + ef) * self.scale
if attn_mask is not None:
# attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
if attn_mask.dtype == tf.float16:
attn_score = attn_score - 65500 * attn_mask
else:
attn_score = attn_score - 1e30 * attn_mask
# attention probability
attn_prob = tf.nn.softmax(attn_score, axis=1)
attn_prob = self.dropout(attn_prob, training=training)
# Mask heads if we want to
if head_mask is not None:
attn_prob = attn_prob * head_mask
# attention output
attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h)
if self.output_attentions:
return attn_vec, attn_prob
return attn_vec
def post_attention(self, inputs, residual=True, training=False):
"""Post-attention processing."""
# post-attention projection (back to `d_model`)
h, attn_vec = inputs
attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o)
attn_out = self.dropout(attn_out, training=training)
if residual:
attn_out = attn_out + h
output = self.layer_norm(attn_out)
return output
def call(self, inputs, training=False):
(h, g, attn_mask_h, attn_mask_g,
r, seg_mat, mems, target_mapping, head_mask) = inputs
if g is not None:
###### Two-stream attention with relative positional encoding.
# content based attention score
if mems is not None and mems.shape.ndims > 1:
cat = tf.concat([mems, h], axis=0)
else:
cat = h
# content-based key head
k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k)
# content-based value head
v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v)
# position-based key head
k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r)
##### h-stream
# content-stream query head
q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q)
# core attention ops
attn_vec_h = self.rel_attn_core(
[q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask],
training=training)
if self.output_attentions:
attn_vec_h, attn_prob_h = attn_vec_h
# post processing
output_h = self.post_attention([h, attn_vec_h], training=training)
##### g-stream
# query-stream query head
q_head_g = tf.einsum('ibh,hnd->ibnd', g, self.q)
# core attention ops
if target_mapping is not None:
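# scatter the num_predict query-stream heads onto their target positions in the sequence
# (the attention output is gathered back onto the num_predict positions below)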
q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
attn_vec_g = self.rel_attn_core(
[q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask],
training=training)
if self.output_attentions:
attn_vec_g, attn_prob_g = attn_vec_g
attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping)
else:
attn_vec_g = self.rel_attn_core(
[q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask],
training=training)
if self.output_attentions:
attn_vec_g, attn_prob_g = attn_vec_g
# post processing
output_g = self.post_attention([g, attn_vec_g], training=training)
if self.output_attentions:
attn_prob = attn_prob_h, attn_prob_g
else:
###### Multi-head attention with relative positional encoding
if mems is not None and mems.shape.ndims > 1:
cat = tf.concat([mems, h], axis=0)
else:
cat = h
# content heads
q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q)
k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k)
v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v)
# positional heads
k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r)
# core attention ops
attn_vec = self.rel_attn_core(
[q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask],
training=training)
if self.output_attentions:
attn_vec, attn_prob = attn_vec
# post processing
output_h = self.post_attention([h, attn_vec], training=training)
output_g = None
outputs = (output_h, output_g)
if self.output_attentions:
outputs = outputs + (attn_prob,)
return outputs
class TFXLNetFeedForward(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLNetFeedForward, self).__init__(**kwargs)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
self.layer_1 = tf.keras.layers.Dense(config.d_inner,
kernel_initializer=get_initializer(config.initializer_range),
name='layer_1')
self.layer_2 = tf.keras.layers.Dense(config.d_model,
kernel_initializer=get_initializer(config.initializer_range),
name='layer_2')
self.dropout = tf.keras.layers.Dropout(config.dropout)
if isinstance(config.ff_activation, str) or \
(sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):
self.activation_function = ACT2FN[config.ff_activation]
else:
self.activation_function = config.ff_activation
def call(self, inp, training=False):
output = inp
output = self.layer_1(output)
output = self.activation_function(output)
output = self.dropout(output, training=training)
output = self.layer_2(output)
output = self.dropout(output, training=training)
output = self.layer_norm(output + inp)
return output
class TFXLNetLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLNetLayer, self).__init__(**kwargs)
self.rel_attn = TFXLNetRelativeAttention(config, name='rel_attn')
self.ff = TFXLNetFeedForward(config, name='ff')
self.dropout = tf.keras.layers.Dropout(config.dropout)
def call(self, inputs, training=False):
outputs = self.rel_attn(inputs, training=training)
output_h, output_g = outputs[:2]
if output_g is not None:
output_g = self.ff(output_g, training=training)
output_h = self.ff(output_h, training=training)
outputs = (output_h, output_g) + outputs[2:] # Add the attentions again if they are there
return outputs
class TFXLNetLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super(TFXLNetLMHead, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
super(TFXLNetLMHead, self).build(input_shape)
def call(self, hidden_states):
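# project hidden states back onto the vocabulary with the tied input embedding matrix
# (TFSharedEmbeddings in "linear" mode), then add the output-only bias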
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
class TFXLNetMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLNetMainLayer, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.mem_len = config.mem_len
self.reuse_len = config.reuse_len
self.d_model = config.d_model
self.same_length = config.same_length
self.attn_type = config.attn_type
self.bi_data = config.bi_data
self.clamp_len = config.clamp_len
self.n_layer = config.n_layer
self.use_bfloat16 = config.use_bfloat16
self.initializer_range = config.initializer_range
self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
self.dropout = tf.keras.layers.Dropout(config.dropout)
def build(self, input_shape):
initializer = get_initializer(self.initializer_range)
self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
initializer=initializer,
trainable=True, name='mask_emb')
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
def create_mask(self, qlen, mlen, dtype=tf.float32):
"""
Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
Args:
qlen: length of the current segment (number of query tokens).
mlen: length of the cached memory (number of tokens kept from previous segments).
::
same_length=False: same_length=True:
<mlen > < qlen > <mlen > < qlen >
^ [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 1 1 1 1]
[0 0 0 0 0 0 1 1 1] [1 0 0 0 0 0 1 1 1]
qlen [0 0 0 0 0 0 0 1 1] [1 1 0 0 0 0 0 1 1]
[0 0 0 0 0 0 0 0 1] [1 1 1 0 0 0 0 0 1]
v [0 0 0 0 0 0 0 0 0] [1 1 1 1 0 0 0 0 0]
"""
attn_mask = tf.ones([qlen, qlen], dtype=dtype)
mask_u = tf.linalg.band_part(attn_mask, 0, -1) # upper triangular part (tf.matrix_band_part was renamed to tf.linalg.band_part in TF 2.0)
mask_dia = tf.linalg.band_part(attn_mask, 0, 0) # diagonal
attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
if self.same_length:
mask_l = tf.linalg.band_part(attn_mask, -1, 0) # lower triangular part
ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
return ret
def cache_mem(self, curr_out, prev_mem):
"""cache hidden states into memory."""
if self.mem_len is None or self.mem_len == 0:
return None
else:
if self.reuse_len is not None and self.reuse_len > 0:
curr_out = curr_out[:self.reuse_len]
if prev_mem is None:
new_mem = curr_out[-self.mem_len:]
else:
new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len:]
return tf.stop_gradient(new_mem)
@staticmethod
def positional_embedding(pos_seq, inv_freq, bsz=None):
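# outer product of relative positions and inverse frequencies -> [len, d_model/2];
# concatenating sin and cos gives the usual sinusoidal encoding of shape [len, 1, d_model]
# (tiled to [len, bsz, d_model] when bsz is given)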
sinusoid_inp = tf.einsum('i,d->id', pos_seq, inv_freq)
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1)
pos_emb = pos_emb[:, None, :]
if bsz is not None:
pos_emb = tf.tile(pos_emb, [1, bsz, 1])
return pos_emb
def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=None):
"""create relative positional encoding."""
freq_seq = tf.range(0, self.d_model, 2.0)
if dtype is not None and dtype != tf.float32:
freq_seq = tf.cast(freq_seq, dtype=dtype)
inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
if self.attn_type == 'bi':
# beg, end = klen - 1, -qlen
beg, end = klen, -qlen
elif self.attn_type == 'uni':
# beg, end = klen - 1, -1
beg, end = klen, -1
else:
raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
if self.bi_data:
fwd_pos_seq = tf.range(beg, end, -1.0)
bwd_pos_seq = tf.range(-beg, -end, 1.0)
if dtype is not None and dtype != tf.float32:
fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
bwd_pos_seq = tf.cast(bwd_pos_seq, dtype=dtype)
if self.clamp_len > 0:
fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len)
if bsz is not None:
# With bi_data, the batch size should be divisible by 2.
assert bsz%2 == 0
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
else:
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1)
else:
fwd_pos_seq = tf.range(beg, end, -1.0)
if dtype is not None and dtype != tf.float32:
fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
if self.clamp_len > 0:
fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
return pos_emb
def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
mems = inputs[2] if len(inputs) > 2 else mems
perm_mask = inputs[3] if len(inputs) > 3 else perm_mask
target_mapping = inputs[4] if len(inputs) > 4 else target_mapping
token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids
input_mask = inputs[6] if len(inputs) > 6 else input_mask
head_mask = inputs[7] if len(inputs) > 7 else head_mask
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
mems = inputs.get('mems', mems)
perm_mask = inputs.get('perm_mask', perm_mask)
target_mapping = inputs.get('target_mapping', target_mapping)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
input_mask = inputs.get('input_mask', input_mask)
head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 8, "Too many inputs."
else:
input_ids = inputs
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
# so we move the batch dimension to the end here
input_ids = tf.transpose(input_ids, perm=(1, 0))
token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None
qlen, bsz = shape_list(input_ids)[:2]
mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0
klen = mlen + qlen
dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32
##### Attention mask
# causal attention mask
if self.attn_type == 'uni':
attn_mask = self.create_mask(qlen, mlen)
attn_mask = attn_mask[:, :, None, None]
elif self.attn_type == 'bi':
attn_mask = None
else:
raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
# data mask: input mask & perm mask
assert input_mask is None or attention_mask is None, ("You can only use one of input_mask (uses 1 for padding) "
"or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one.")
if input_mask is None and attention_mask is not None:
input_mask = 1.0 - attention_mask
if input_mask is not None and perm_mask is not None:
data_mask = input_mask[None] + perm_mask
elif input_mask is not None and perm_mask is None:
data_mask = input_mask[None]
elif input_mask is None and perm_mask is not None:
data_mask = perm_mask
else:
data_mask = None
if data_mask is not None:
# all mems can be attended to
mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
dtype=dtype_float)
data_mask = tf.concat([mems_mask, data_mask], axis=1)
if attn_mask is None:
attn_mask = data_mask[:, :, :, None]
else:
attn_mask += data_mask[:, :, :, None]
if attn_mask is not None:
attn_mask = tf.cast(attn_mask > 0, dtype=dtype_float)
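# non_tgt_mask relaxes the mask on the diagonal so that each token can attend to itself
# in the content (h) stream, while the stricter attn_mask is kept for the query (g) stream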
if attn_mask is not None:
non_tgt_mask = -tf.eye(qlen, dtype=dtype_float)
non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=dtype_float), non_tgt_mask], axis=-1)
non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0, dtype=dtype_float)
else:
non_tgt_mask = None
##### Word embeddings and prepare h & g hidden states
word_emb_k = self.word_embedding(input_ids)
output_h = self.dropout(word_emb_k, training=training)
if target_mapping is not None:
word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
# else: # We removed the inp_q input which was same as target mapping
# inp_q_ext = inp_q[:, :, None]
# word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
output_g = self.dropout(word_emb_q, training=training)
else:
output_g = None
##### Segment embedding
if token_type_ids is not None:
# Convert `token_type_ids` to one-hot `seg_mat`
mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)
cat_ids = tf.concat([mem_pad, token_type_ids], 0)
# `1` indicates not in the same segment [qlen x klen x bsz]
seg_mat = tf.cast(
tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])),
tf.int32)
seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float)
else:
seg_mat = None
##### Positional encoding
pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float)
pos_emb = self.dropout(pos_emb, training=training)
# Prepare head mask if needed
# 1.0 in head_mask indicates we keep the head
# attn_prob below has shape [qlen, klen, bsz, n_head]
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
# head_mask is broadcast against attn_prob, so we reshape it to [n_layer x 1 x 1 x 1 x n_head]
if head_mask is not None:
head_mask = tf.cast(head_mask, dtype=dtype_float) # fp16/bfloat16 compatibility
if head_mask.shape.ndims == 1:
head_mask = head_mask[None, None, None, None, :]
head_mask = tf.tile(head_mask, [self.n_layer, 1, 1, 1, 1])
elif head_mask.shape.ndims == 2:
head_mask = head_mask[:, None, None, None, :]
else:
head_mask = [None] * self.n_layer
new_mems = ()
if mems is None:
mems = [None] * len(self.layer)
attentions = []
hidden_states = []
for i, layer_module in enumerate(self.layer):
# cache new mems
new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
if self.output_hidden_states:
hidden_states.append((output_h, output_g) if output_g is not None else output_h)
outputs = layer_module([output_h, output_g, non_tgt_mask, attn_mask,
pos_emb, seg_mat, mems[i], target_mapping,
head_mask[i]], training=training)
output_h, output_g = outputs[:2]
if self.output_attentions:
attentions.append(outputs[2])
# Add last hidden state
if self.output_hidden_states:
hidden_states.append((output_h, output_g) if output_g is not None else output_h)
output = self.dropout(output_g if output_g is not None else output_h, training=training)
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
outputs = (tf.transpose(output, perm=(1, 0, 2)), new_mems)
if self.output_hidden_states:
if output_g is not None:
hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs)
else:
hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states)
outputs = outputs + (hidden_states,)
if self.output_attentions:
attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
outputs = outputs + (attentions,)
return outputs # outputs, new_mems, (hidden_states), (attentions)
class TFXLNetPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLNetConfig
pretrained_model_archive_map = TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
load_pt_weights = load_xlnet_pt_weights_in_tf2
base_model_prefix = "transformer"
XLNET_START_DOCSTRING = r""" The XLNet model was proposed in
`XLNet: Generalized Autoregressive Pretraining for Language Understanding`_
by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
XLNet is an extension of the Transformer-XL model pre-trained using an autoregressive method
to learn bidirectional contexts by maximizing the expected likelihood over all permutations
of the factorization order of the input sequence.
The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
Due to the difficulty of training a fully auto-regressive model over various factorization orders,
XLNet is pretrained using only a subset of the output tokens as targets, selected
with the `target_mapping` input.
To use XLNet for sequential decoding (i.e. not in the fully bi-directional setting), use the `perm_mask` and
`target_mapping` inputs to control the attention span and the outputs (see examples in `examples/run_generation.py`).
This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
.. _`XLNet: Generalized Autoregressive Pretraining for Language Understanding`:
http://arxiv.org/abs/1906.08237
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
XLNET_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
XLNet is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`transformers.XLNetTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**mems**: (`optional`)
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer)
containing pre-computed hidden-states (keys and values in the attention blocks) as output by the model
(see `mems` output below). Can be used to speed up sequential decoding and attend to a longer context.
To activate mems you need to set config.mem_len to a positive value, which will be the maximum number of tokens
kept in the memory output by the model. E.g. `model = TFXLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)` will
instantiate a model which can use up to 1024 tokens of memory (in addition to the input itself).
See the commented usage sketch below this docstring.
**perm_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
If ``perm_mask[k, i, j] = 0``, i attends to j in batch k;
if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
If None, each token attends to all the others (full bidirectional attention).
Only used during pretraining (to define factorization order) or for sequential decoding (generation).
**target_mapping**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_predict, sequence_length)``:
Mask to indicate the output tokens to use.
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
Only used during pretraining for partial prediction or for sequential decoding (generation).
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
the important thing is that they should be different for tokens which belong to different segments.
The model will compute relative segment differences from the given type indices:
0 if the segment ids of two tokens are the same, 1 otherwise.
**input_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
Kept for compatibility with the original code base.
You can only use one of `input_mask` and `attention_mask`.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare XLNet Model transformer outputing raw hidden-states without any specific head on top.",
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class TFXLNetModel(TFXLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**mems**:
list of ``tf.Tensor`` (one for each layer)
containing pre-computed hidden-states (keys and values in the attention blocks) as computed by the model
if config.mem_len > 0, else a tuple of None. Can be used to speed up sequential decoding and attend to a longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetModel.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""XLNet Model with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**mems**:
list of ``tf.Tensor`` (one for each layer)
containing pre-computed hidden-states (keys and values in the attention blocks) as computed by the model
if config.mem_len > 0, else a tuple of None. Can be used to speed up sequential decoding and attend to a longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
import numpy as np
from transformers import XLNetTokenizer, TFXLNetLMHeadModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
# We show how to set up inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>"))[None, :] # We will predict the masked token
# Build the masks in numpy (TF tensors do not support item assignment), then convert them to tensors.
perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=np.float32)
perm_mask[:, :, -1] = 1.0 # Previous tokens don't see the last token
target_mapping = np.zeros((1, 1, input_ids.shape[1]), dtype=np.float32) # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=tf.constant(perm_mask), target_mapping=tf.constant(target_mapping))
next_token_logits = outputs[0] # Output has shape [1, 1, config.vocab_size]: scores for the single predicted position
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name='transformer')
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_state = transformer_outputs[0]
logits = self.lm_loss(hidden_state)
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states and attentions if they are in it
return outputs # return logits, mems, (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**mems**:
list of ``tf.Tensor`` (one for each layer)
containing pre-computed hidden-states (keys and values in the attention blocks) as computed by the model
if config.mem_len > 0, else a tuple of None. Can be used to speed up sequential decoding and attend to a longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLNetMainLayer(config, name='transformer')
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary')
self.logits_proj = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='logits_proj')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
output = transformer_outputs[0]
output = self.sequence_summary(output)
logits = self.logits_proj(output)
outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states and attentions if they are in it
return outputs # return logits, mems, (hidden states), (attentions)
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
# the hidden-states output to compute `span start logits` and `span end logits`). """,
# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel):
class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
start_scores, end_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name='transformer')
self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='qa_outputs')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states and attentions if they are in it
return outputs # start_logits, end_logits, (hidden_states), (attentions)
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
# the hidden-states output to compute `span start logits` and `span end logits`). """,
# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel):
# r"""
# Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
# **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)``
# Log probabilities for the top config.start_n_top start token possibilities (beam-search).
# **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
# Indices for the top config.start_n_top start token possibilities (beam-search).
# **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
# Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
# **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
# Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
# **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``tf.Tensor`` of shape ``(batch_size,)``
# Log probabilities for the ``is_impossible`` label of the answers.
# **mems**:
# list of ``tf.Tensor`` (one for each layer):
# that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
# if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
# See details in the docstring of the `mems` input above.
# **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
# list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
# of shape ``(batch_size, sequence_length, hidden_size)``:
# Hidden-states of the model at the output of each layer plus the initial embedding outputs.
# **attentions**: (`optional`, returned when ``config.output_attentions=True``)
# list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
# Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
# Examples::
# tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
# model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
# input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
# start_positions = tf.constant([1])
# end_positions = tf.constant([3])
# outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
# loss, start_scores, end_scores = outputs[:2]
# """
# def __init__(self, config, *inputs, **kwargs):
# super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
# self.start_n_top = config.start_n_top
# self.end_n_top = config.end_n_top
# self.transformer = TFXLNetMainLayer(config, name='transformer')
# self.start_logits = TFPoolerStartLogits(config, name='start_logits')
# self.end_logits = TFPoolerEndLogits(config, name='end_logits')
# self.answer_class = TFPoolerAnswerClass(config, name='answer_class')
# def call(self, inputs, training=False):
# transformer_outputs = self.transformer(inputs, training=training)
# hidden_states = transformer_outputs[0]
# start_logits = self.start_logits(hidden_states, p_mask=p_mask)
# outputs = transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it
# if start_positions is not None and end_positions is not None:
# # If we are on multi-GPU, let's remove the dimension added by batch splitting
# for x in (start_positions, end_positions, cls_index, is_impossible):
# if x is not None and x.dim() > 1:
# x.squeeze_(-1)
# # during training, compute the end logits based on the ground truth of the start position
# end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
# loss_fct = CrossEntropyLoss()
# start_loss = loss_fct(start_logits, start_positions)
# end_loss = loss_fct(end_logits, end_positions)
# total_loss = (start_loss + end_loss) / 2
# if cls_index is not None and is_impossible is not None:
# # Predict answerability from the representation of CLS and START
# cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
# loss_fct_cls = nn.BCEWithLogitsLoss()
# cls_loss = loss_fct_cls(cls_logits, is_impossible)
# # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
# total_loss += cls_loss * 0.5
# outputs = (total_loss,) + outputs
# else:
# # during inference, compute the end logits based on beam search
# bsz, slen, hsz = hidden_states.size()
# start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
# start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
# start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
# start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
# start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
# hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
# p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
# end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
# end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
# end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
# end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
# end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
# start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) # get the representation of START as weighted sum of hidden states
# cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample
# outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
# # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
# # or (if labels are provided) (total_loss,)
# return outputs
@@ -194,7 +194,7 @@ class PositionalEmbedding(nn.Module):
class PositionwiseFF(nn.Module):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
super(PositionwiseFF, self).__init__()
self.d_model = d_model
@@ -208,7 +208,7 @@ class PositionwiseFF(nn.Module):
nn.Dropout(dropout),
)
self.layer_norm = nn.LayerNorm(d_model)
self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
self.pre_lnorm = pre_lnorm
@@ -229,102 +229,12 @@ class PositionwiseFF(nn.Module):
return output
class MultiHeadAttn(nn.Module):
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False):
super(MultiHeadAttn, self).__init__()
self.output_attentions = output_attentions
self.n_head = n_head
self.d_model = d_model
self.d_head = d_head
self.dropout = dropout
self.q_net = nn.Linear(d_model, n_head * d_head, bias=False)
self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False)
self.drop = nn.Dropout(dropout)
self.dropatt = nn.Dropout(dropatt)
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
self.layer_norm = nn.LayerNorm(d_model)
self.scale = 1 / (d_head ** 0.5)
self.pre_lnorm = pre_lnorm
if r_r_bias is None or r_w_bias is None: # Biases are not shared
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
else:
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
def forward(self, h, attn_mask=None, mems=None, head_mask=None):
##### multihead attention
# [hlen x bsz x n_head x d_head]
if mems is not None:
c = torch.cat([mems, h], 0)
else:
c = h
if self.pre_lnorm:
##### layer normalization
c = self.layer_norm(c)
head_q = self.q_net(h)
head_k, head_v = torch.chunk(self.kv_net(c), 2, -1)
head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head)
head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head)
head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head)
# [qlen x klen x bsz x n_head]
attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k))
attn_score.mul_(self.scale)
if attn_mask is not None and torch.sum(attn_mask).item():
attn_mask = (attn_mask == 1) # Switch to bool
if attn_mask.dim() == 2:
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
elif attn_mask.dim() == 3:
attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf'))
# [qlen x klen x bsz x n_head]
attn_prob = F.softmax(attn_score, dim=1)
attn_prob = self.dropatt(attn_prob)
# Mask heads if we want to
if head_mask is not None:
attn_prob = attn_prob * head_mask
# [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head]
attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v))
attn_vec = attn_vec.contiguous().view(
attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
##### linear projection
attn_out = self.o_net(attn_vec)
attn_out = self.drop(attn_out)
if self.pre_lnorm:
##### residual connection
outputs = [h + attn_out]
else:
##### residual connection + layer normalization
outputs = [self.layer_norm(h + attn_out)]
if self.output_attentions:
outputs.append(attn_prob)
return outputs
class RelMultiHeadAttn(nn.Module):
class RelPartialLearnableMultiHeadAttn(nn.Module):
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
r_r_bias=None, r_w_bias=None, output_attentions=False):
super(RelMultiHeadAttn, self).__init__()
r_r_bias=None, r_w_bias=None, output_attentions=False,
layer_norm_epsilon=1e-5):
super(RelPartialLearnableMultiHeadAttn, self).__init__()
self.output_attentions = output_attentions
self.n_head = n_head
@@ -338,7 +248,7 @@ class RelMultiHeadAttn(nn.Module):
self.dropatt = nn.Dropout(dropatt)
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
self.layer_norm = nn.LayerNorm(d_model)
self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
self.scale = 1 / (d_head ** 0.5)
@@ -351,36 +261,9 @@ class RelMultiHeadAttn(nn.Module):
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
def _parallelogram_mask(self, h, w, left=False):
mask = torch.ones((h, w)).byte()
m = min(h, w)
mask[:m,:m] = torch.triu(mask[:m,:m])
mask[-m:,-m:] = torch.tril(mask[-m:,-m:])
if left:
return mask
else:
return mask.flip(0)
def _shift(self, x, qlen, klen, mask, left=False):
if qlen > 1:
zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)),
device=x.device, dtype=x.dtype)
else:
zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype)
if left:
mask = mask.flip(1)
x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1)
else:
x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1)
x = x_padded.masked_select(mask[:,:,None,None]) \
.view(qlen, klen, x.size(2), x.size(3))
return x
self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
def _rel_shift(self, x, zero_triu=False):
def _rel_shift(self, x):
zero_pad_shape = (x.size(0), 1) + x.size()[2:]
zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)
x_padded = torch.cat([zero_pad, x], dim=1)
@@ -390,21 +273,8 @@ class RelMultiHeadAttn(nn.Module):
x = x_padded[1:].view_as(x)
if zero_triu:
ones = torch.ones((x.size(0), x.size(1)))
x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None]
return x
def forward(self, w, r, attn_mask=None, mems=None):
raise NotImplementedError
class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
def __init__(self, *args, **kwargs):
super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs)
self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
def forward(self, w, r, attn_mask=None, mems=None, head_mask=None):
qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)
@@ -496,148 +366,17 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
return outputs
class RelLearnableMultiHeadAttn(RelMultiHeadAttn):
def __init__(self, *args, **kwargs):
super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs)
def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None, head_mask=None):
# r_emb: [klen, n_head, d_head], used for term B
# r_w_bias: [n_head, d_head], used for term C
# r_bias: [klen, n_head], used for term D
qlen, bsz = w.size(0), w.size(1)
if mems is not None:
cat = torch.cat([mems, w], 0)
if self.pre_lnorm:
w_heads = self.qkv_net(self.layer_norm(cat))
else:
w_heads = self.qkv_net(cat)
w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
w_head_q = w_head_q[-qlen:]
else:
if self.pre_lnorm:
w_heads = self.qkv_net(self.layer_norm(w))
else:
w_heads = self.qkv_net(w)
w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
klen = w_head_k.size(0)
w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)
w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)
w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)
if klen > r_emb.size(0):
r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1)
r_emb = torch.cat([r_emb_pad, r_emb], 0)
r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1)
r_bias = torch.cat([r_bias_pad, r_bias], 0)
else:
r_emb = r_emb[-klen:]
r_bias = r_bias[-klen:]
#### compute attention score
rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head
AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head
B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head
D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head
BD = self._rel_shift(B_ + D_)
# [qlen x klen x bsz x n_head]
attn_score = AC + BD
attn_score.mul_(self.scale)
#### compute attention probability
if attn_mask is not None and torch.sum(attn_mask).item():
attn_mask = (attn_mask == 1) # Switch to bool
if attn_mask.dim() == 2:
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
elif attn_mask.dim() == 3:
attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf'))
# [qlen x klen x bsz x n_head]
attn_prob = F.softmax(attn_score, dim=1)
attn_prob = self.dropatt(attn_prob)
if head_mask is not None:
attn_prob = attn_prob * head_mask
#### compute attention vector
attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v))
# [qlen x bsz x n_head x d_head]
attn_vec = attn_vec.contiguous().view(
attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
##### linear projection
attn_out = self.o_net(attn_vec)
attn_out = self.drop(attn_out)
if self.pre_lnorm:
##### residual connection
outputs = [w + attn_out]
else:
##### residual connection + layer normalization
outputs = [self.layer_norm(w + attn_out)]
if self.output_attentions:
outputs.append(attn_prob)
return outputs
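# A minimal shape walk-through (not part of the original file) of the einsum
# contractions used in the attention-score computation above; the dimensions
# qlen=2, klen=3, bsz=1, n_head=4, d_head=8 are arbitrary example values.
def _rel_attn_score_shape_demo():
    w_head_q = torch.randn(2, 1, 4, 8)                    # qlen x bsz x n_head x d_head
    w_head_k = torch.randn(3, 1, 4, 8)                    # klen x bsz x n_head x d_head
    r_emb = torch.randn(3, 4, 8)                          # klen x n_head x d_head
    r_bias = torch.randn(3, 4)                            # klen x n_head
    AC = torch.einsum('ibnd,jbnd->ijbn', (w_head_q, w_head_k))
    B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb))
    D_ = r_bias[None, :, None]                            # 1 x klen x 1 x n_head
    assert AC.shape == (2, 3, 1, 4) and B_.shape == (2, 3, 1, 4)
    assert (B_ + D_).shape == (2, 3, 1, 4)                # broadcasts over qlen and bsz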
class DecoderLayer(nn.Module):
def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs):
super(DecoderLayer, self).__init__()
self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None):
attn_outputs = self.dec_attn(dec_inp, attn_mask=dec_attn_mask,
mems=mems, head_mask=head_mask)
ff_output = self.pos_ff(attn_outputs[0])
outputs = [ff_output] + attn_outputs[1:]
return outputs
class RelLearnableDecoderLayer(nn.Module):
def __init__(self, n_head, d_model, d_head, d_inner, dropout,
**kwargs):
super(RelLearnableDecoderLayer, self).__init__()
self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout,
**kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None, head_mask=None):
attn_outputs = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias,
attn_mask=dec_attn_mask,
mems=mems, head_mask=head_mask)
ff_output = self.pos_ff(attn_outputs[0])
outputs = [ff_output] + attn_outputs[1:]
return outputs
class RelPartialLearnableDecoderLayer(nn.Module):
def __init__(self, n_head, d_model, d_head, d_inner, dropout,
def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5,
**kwargs):
super(RelPartialLearnableDecoderLayer, self).__init__()
self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,
d_head, dropout, **kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'),
layer_norm_epsilon=layer_norm_epsilon)
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
@@ -651,7 +390,6 @@ class RelPartialLearnableDecoderLayer(nn.Module):
return outputs
class AdaptiveEmbedding(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
sample_softmax=False):
@@ -775,9 +513,6 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
if hasattr(m, 'r_bias'):
self._init_bias(m.r_bias)
def set_num_special_tokens(self, num_special_tokens):
pass
TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
@@ -796,9 +531,9 @@ TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
@@ -807,9 +542,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary.
Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mems**: (`optional`)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (keys and values in the attention blocks) as computed by the model
@@ -888,45 +623,19 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
r_w_bias=None if config.untie_r else self.r_w_bias,
r_r_bias=None if config.untie_r else self.r_r_bias,
output_attentions=self.output_attentions)
)
elif config.attn_type == 1: # learnable embeddings
for i in range(config.n_layer):
self.layers.append(
RelLearnableDecoderLayer(
config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
r_w_bias=None if config.untie_r else self.r_w_bias,
r_r_bias=None if config.untie_r else self.r_r_bias,
output_attentions=self.output_attentions)
)
elif config.attn_type in [2, 3]: # absolute embeddings
for i in range(config.n_layer):
self.layers.append(
DecoderLayer(
config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
r_w_bias=None if config.untie_r else self.r_w_bias,
r_r_bias=None if config.untie_r else self.r_r_bias,
output_attentions=self.output_attentions)
output_attentions=self.output_attentions,
layer_norm_epsilon=config.layer_norm_epsilon)
)
else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
raise NotImplementedError # Removed them to avoid maintaining dead code
self.same_length = config.same_length
self.clamp_len = config.clamp_len
if self.attn_type == 0: # default attention
self.pos_emb = PositionalEmbedding(self.d_model)
elif self.attn_type == 1: # learnable
self.r_emb = nn.Parameter(torch.FloatTensor(
self.n_layer, self.max_klen, self.n_head, self.d_head))
self.r_bias = nn.Parameter(torch.FloatTensor(
self.n_layer, self.max_klen, self.n_head))
elif self.attn_type == 2: # absolute standard
self.pos_emb = PositionalEmbedding(self.d_model)
elif self.attn_type == 3: # absolute deeper SA
self.r_emb = nn.Parameter(torch.FloatTensor(
self.n_layer, self.max_klen, self.n_head, self.d_head))
else: # learnable embeddings and absolute embeddings
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
self.init_weights()
@@ -981,8 +690,15 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return new_mems
def _forward(self, dec_inp, mems=None, head_mask=None):
qlen, bsz = dec_inp.size()
def forward(self, input_ids, mems=None, head_mask=None):
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = input_ids.transpose(0, 1).contiguous()
if mems is None:
mems = self.init_mems(input_ids)
qlen, bsz = input_ids.size()
# Prepare head mask if needed
# 1.0 in head_mask indicates we keep the head
@@ -999,7 +715,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
else:
head_mask = [None] * self.n_layer
word_emb = self.word_emb(dec_inp)
word_emb = self.word_emb(input_ids)
mlen = mems[0].size(0) if mems is not None else 0
klen = mlen + qlen
@@ -1036,64 +752,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
elif self.attn_type == 1: # learnable
core_out = self.drop(word_emb)
for i, layer in enumerate(self.layers):
hids.append(core_out)
if self.clamp_len > 0:
r_emb = self.r_emb[i][-self.clamp_len :]
r_bias = self.r_bias[i][-self.clamp_len :]
else:
r_emb, r_bias = self.r_emb[i], self.r_bias[i]
mems_i = None if mems is None else mems[i]
layer_outputs = layer(core_out, r_emb, self.r_w_bias[i],
r_bias, dec_attn_mask=dec_attn_mask,
mems=mems_i, head_mask=head_mask[i])
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
elif self.attn_type == 2: # absolute
pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
dtype=word_emb.dtype)
if self.clamp_len > 0:
pos_seq.clamp_(max=self.clamp_len)
pos_emb = self.pos_emb(pos_seq)
core_out = self.drop(word_emb + pos_emb[-qlen:])
for i, layer in enumerate(self.layers):
hids.append(core_out)
mems_i = None if mems is None else mems[i]
if mems_i is not None and i == 0:
mems_i += pos_emb[:mlen]
layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask,
mems=mems_i, head_mask=head_mask[i])
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
elif self.attn_type == 3:
core_out = self.drop(word_emb)
for i, layer in enumerate(self.layers):
hids.append(core_out)
mems_i = None if mems is None else mems[i]
if mems_i is not None and mlen > 0:
cur_emb = self.r_emb[i][:-qlen]
cur_size = cur_emb.size(0)
if cur_size < mlen:
cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1)
cur_emb = torch.cat([cur_emb_pad, cur_emb], 0)
else:
cur_emb = cur_emb[-mlen:]
mems_i += cur_emb.view(mlen, 1, -1)
core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)
layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask,
mems=mems_i, head_mask=head_mask[i])
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
else: # learnable embeddings and absolute embeddings
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
core_out = self.drop(core_out)
@@ -1110,16 +770,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
# Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
outputs.append(attentions)
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
def forward(self, input_ids, mems=None, head_mask=None):
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = input_ids.transpose(0, 1).contiguous()
if mems is None:
mems = self.init_mems(input_ids)
outputs = self._forward(input_ids, mems=mems, head_mask=head_mask)
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
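# A minimal usage sketch (not part of the original file). 'transfo-xl-wt103' is the
# standard pretrained shortcut name; running this downloads the pretrained weights,
# and it assumes the top-level package exports the tokenizer as elsewhere in the library.
def _transfo_xl_usage_demo():
    from transformers import TransfoXLTokenizer
    tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
    input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # shape [bsz, len]
    last_hidden_state, new_mems = model(input_ids)[:2]    # hidden state comes back as [bsz, len, d_model]
    return last_hidden_state.shape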
......
@@ -31,7 +31,7 @@ from torch.nn import CrossEntropyLoss
from torch.nn import functional as F
from .configuration_utils import PretrainedConfig
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
logger = logging.getLogger(__name__)
@@ -52,16 +52,16 @@ except ImportError:
class PreTrainedModel(nn.Module):
r""" Base class for all models.
:class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
:class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- ``pretrained_model_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
- ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
- ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
- ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
- ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
- ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
- ``path``: a path (string) to the TensorFlow checkpoint.
- ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
@@ -189,7 +189,7 @@ class PreTrainedModel(nn.Module):
def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it
can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method.
can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -201,8 +201,8 @@ class PreTrainedModel(nn.Module):
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
logger.info("Model weights saved in {}".format(output_model_file))
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -220,23 +220,24 @@ class PreTrainedModel(nn.Module):
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint to a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from the saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
@@ -256,7 +257,7 @@ class PreTrainedModel(nn.Module):
Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
@@ -289,103 +290,125 @@ class PreTrainedModel(nn.Module):
model_kwargs = kwargs
# Load model
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
if from_tf:
# Directly load from a TensorFlow checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
if pretrained_model_name_or_path is not None:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
# Load from a TF 1.0 checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
# Load from a TF 2.0 checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
# Load from a PyTorch checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
else:
raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
[WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
pretrained_model_name_or_path))
elif os.path.isfile(pretrained_model_name_or_path):
archive_file = pretrained_model_name_or_path
else:
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
else:
if from_tf:
# Directly load from a TensorFlow checkpoint
assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
archive_file = pretrained_model_name_or_path + ".index"
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(cls.pretrained_model_archive_map.keys()),
archive_file))
raise e
if resolved_archive_file == archive_file:
logger.info("loading weights file {}".format(archive_file))
else:
archive_file = pretrained_model_name_or_path
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(cls.pretrained_model_archive_map.keys()),
archive_file))
raise e
if resolved_archive_file == archive_file:
logger.info("loading weights file {}".format(archive_file))
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
else:
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
resolved_archive_file = None
# Instantiate model.
model = cls(config, *model_args, **model_kwargs)
if state_dict is None and not from_tf:
state_dict = torch.load(resolved_archive_file, map_location='cpu')
if from_tf:
# Directly load from a TensorFlow checkpoint
return cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index'
# Convert old format to new format if needed from a PyTorch state_dict
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
# Load from a PyTorch state_dict
missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')
# Make sure we are able to load base models as well as derived models (with heads)
start_prefix = ''
model_to_load = model
if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
start_prefix = cls.base_model_prefix + '.'
if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
model_to_load = getattr(model, cls.base_model_prefix)
load(model_to_load, prefix=start_prefix)
if len(missing_keys) > 0:
logger.info("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
logger.info("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs)))
if from_tf:
if resolved_archive_file.endswith('.index'):
# Load from a TensorFlow 1.X checkpoint - provided by original authors
model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index'
else:
# Load from our TensorFlow 2.0 checkpoints
try:
from transformers import load_tf2_checkpoint_in_pytorch_model
model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
except ImportError as e:
logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
raise e
else:
# Convert old format to new format if needed from a PyTorch state_dict
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
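# Recursively copy weights module-by-module via _load_from_state_dict; the
# prefix bookkeeping below lets a checkpoint saved with a task head be loaded
# into a bare base model and vice versa.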
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')
# Make sure we are able to load base models as well as derived models (with heads)
start_prefix = ''
model_to_load = model
if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
start_prefix = cls.base_model_prefix + '.'
if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
model_to_load = getattr(model, cls.base_model_prefix)
load(model_to_load, prefix=start_prefix)
if len(missing_keys) > 0:
logger.info("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
logger.info("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs)))
if hasattr(model, 'tie_weights'):
model.tie_weights() # make sure word embedding weights are still tied
@@ -478,7 +501,10 @@ class PoolerEndLogits(nn.Module):
x = self.dense_1(x).squeeze(-1)
if p_mask is not None:
x = x * (1 - p_mask) - 1e30 * p_mask
if next(self.parameters()).dtype == torch.float16:
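# float16 cannot represent 1e30 (its maximum is ~65504), so a large finite value is used for the additive mask instead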
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
@@ -531,7 +557,7 @@ class SQuADHead(nn.Module):
r""" A SQuAD head inspired by XLNet.
Parameters:
config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Inputs:
**hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
@@ -682,7 +708,7 @@ class SequenceSummary(nn.Module):
self.last_dropout = nn.Dropout(config.summary_last_dropout)
def forward(self, hidden_states, cls_index=None):
""" hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
""" hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
cls_index: [optional] position of the classification token if summary_type == 'cls_index',
shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
if summary_type == 'cls_index' and cls_index is None:
......
@@ -63,7 +63,7 @@ def gelu(x):
GELU activation
https://arxiv.org/abs/1606.08415
https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py
https://github.com/huggingface/transformers/blob/master/modeling.py
"""
# return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
@@ -265,9 +265,9 @@ XLM_START_DOCSTRING = r""" The XLM model was proposed in
https://github.com/facebookresearch/XLM
Parameters:
config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
XLM_INPUTS_DOCSTRING = r"""
@@ -278,9 +278,9 @@ XLM_INPUTS_DOCSTRING = r"""
XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
Indices can be obtained using :class:`transformers.XLMTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
@@ -337,11 +337,6 @@ class XLMModel(XLMPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads',
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
'asm_cutoffs', 'asm_div_value']
def __init__(self, config): #, dico, is_encoder, with_output):
super(XLMModel, self).__init__(config)
self.output_attentions = config.output_attentions
@@ -568,10 +563,10 @@ class XLMPredLayer(nn.Module):
"""
outputs = ()
if self.asm is False:
scores = self.proj(x).view(-1, self.n_words)
scores = self.proj(x)
outputs = (scores,) + outputs
if y is not None:
loss = F.cross_entropy(scores, y, reduction='elementwise_mean')
loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction='elementwise_mean')
outputs = (loss,) + outputs
else:
scores = self.proj.log_prob(x)
@@ -723,6 +718,101 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels whether a question has an answer or no answer (SQuAD 2.0)
**cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
**p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total span extraction loss is the average of the Cross-Entropy losses for the start and end positions.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:3]
"""
def __init__(self, config):
super(XLMForQuestionAnsweringSimple, self).__init__(config)
self.transformer = XLMModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,)
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, splitting adds a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs; we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs
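# A minimal sketch (not part of the original file) of the span loss computed
# above, on made-up logits with batch=1 and seq_len=5; positions clamped to
# ignored_index (here 5) are excluded from the loss.
def _span_loss_demo():
    start_logits, end_logits = torch.randn(1, 5), torch.randn(1, 5)
    start_positions, end_positions = torch.tensor([1]), torch.tensor([3])
    ignored_index = start_logits.size(1)
    loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
    start_loss = loss_fct(start_logits, start_positions)
    end_loss = loss_fct(end_logits, end_positions)
    return (start_loss + end_loss) / 2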
@add_start_docstrings("""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class XLMForQuestionAnswering(XLMPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......
@@ -488,9 +488,9 @@ XLNET_START_DOCSTRING = r""" The XLNet model was proposed in
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
XLNET_INPUTS_DOCSTRING = r"""
@@ -499,9 +499,9 @@ XLNET_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary.
XLNet is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
Indices can be obtained using :class:`transformers.XLNetTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
@@ -531,8 +531,10 @@ XLNET_INPUTS_DOCSTRING = r"""
Only used during pretraining for partial prediction or for sequential decoding (generation).
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
the important thing is that they should be different for tokens which belong to different segments.
The model will compute relative segment differences from the given type indices:
0 if the segment ids of two tokens are the same, 1 if they are not.
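For example, ``[0, 0, 0, 1, 1]`` and ``[5, 5, 5, 8, 8]`` describe the same segmentation and therefore produce the same relative segment encoding.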
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
@@ -1103,6 +1105,101 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
return outputs # return (loss), logits, mems, (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss as the average of the start token and end token classification losses.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (keys and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:3]
"""
def __init__(self, config):
super(XLNetForQuestionAnsweringSimple, self).__init__(config)
self.num_labels = config.num_labels
self.transformer = XLNetModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None,
start_positions=None, end_positions=None):
outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,) + outputs[2:]
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, splitting adds a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs; we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
......
@@ -21,15 +21,20 @@ import shutil
import pytest
import logging
from pytorch_transformers import (AutoConfig, BertConfig,
AutoModel, BertModel,
AutoModelWithLMHead, BertForMaskedLM,
AutoModelForSequenceClassification, BertForSequenceClassification,
AutoModelForQuestionAnswering, BertForQuestionAnswering)
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
if is_torch_available():
from transformers import (AutoConfig, BertConfig,
AutoModel, BertModel,
AutoModelWithLMHead, BertForMaskedLM,
AutoModelForSequenceClassification, BertForSequenceClassification,
AutoModelForQuestionAnswering, BertForQuestionAnswering)
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
else:
pytestmark = pytest.mark.skip("Require Torch")
class AutoModelTest(unittest.TestCase):
......
@@ -20,21 +20,26 @@ import unittest
import shutil
import pytest
from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
BertForNextSentencePrediction, BertForPreTraining,
BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification, BertForMultipleChoice)
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers import is_torch_available
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
if is_torch_available():
from transformers import (BertConfig, BertModel, BertForMaskedLM,
BertForNextSentencePrediction, BertForPreTraining,
BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification, BertForMultipleChoice)
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
else:
pytestmark = pytest.mark.skip("Require Torch")
class BertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
BertForTokenClassification)
BertForTokenClassification) if is_torch_available() else ()
class BertModelTester(object):
@@ -305,7 +310,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
@pytest.mark.slow
def test_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
cache_dir = "/tmp/transformers_test/"
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
model = BertModel.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
......
@@ -25,12 +25,18 @@ import uuid
import unittest
import logging
import pytest
import torch
from transformers import is_torch_available
from pytorch_transformers import (PretrainedConfig, PreTrainedModel,
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
if is_torch_available():
import torch
from transformers import (PretrainedConfig, PreTrainedModel,
BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
else:
pytestmark = pytest.mark.skip("Require Torch")
def _config_zero_init(config):
@@ -62,6 +68,16 @@ class CommonTestCases:
self.assertIn(param.data.mean().item(), [0.0, 1.0],
msg="Parameter {} of model {} seems not properly initialized".format(name, model_class))
def test_determinism(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.eval()
first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
self.assertEqual(first.ne(second).sum().item(), 0)
def test_attention_outputs(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -195,6 +211,9 @@ class CommonTestCases:
hidden_states = outputs[-2]
# Remove NaNs
for t in attentions:
self.assertLess(torch.sum(torch.isnan(t)), t.numel() / 4) # Check we don't have more than 25% nans (arbitrary)
attentions = [t.masked_fill(torch.isnan(t), 0.0) for t in attentions] # remove them (the test is less complete)
self.assertIsNotNone(multihead_outputs)
self.assertEqual(len(multihead_outputs), self.model_tester.num_hidden_layers)
@@ -602,7 +621,7 @@ class CommonTestCases:
[[], []])
def create_and_check_model_from_pretrained(self):
cache_dir = "/tmp/pytorch_transformers_test/"
cache_dir = "/tmp/transformers_test/"
for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]:
model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir)
shutil.rmtree(cache_dir)
......
@@ -17,9 +17,15 @@ from __future__ import division
from __future__ import print_function
import unittest
import pytest
from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
from transformers import is_torch_available
if is_torch_available():
from transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
else:
pytestmark = pytest.mark.skip("Require Torch")
from .modeling_common_test import (CommonTestCases, ids_tensor)
from .configuration_common_test import ConfigTester
@@ -28,7 +34,7 @@ from .configuration_common_test import ConfigTester
class DistilBertModelTest(CommonTestCases.CommonModelTester):
all_model_classes = (DistilBertModel, DistilBertForMaskedLM, DistilBertForQuestionAnswering,
DistilBertForSequenceClassification)
DistilBertForSequenceClassification) if is_torch_available() else None
test_pruning = True
test_torchscript = True
test_resize_embeddings = True
@@ -205,7 +211,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
# @pytest.mark.slow
# def test_model_from_pretrained(self):
# cache_dir = "/tmp/pytorch_transformers_test/"
# cache_dir = "/tmp/transformers_test/"
# for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
# model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir)
# shutil.rmtree(cache_dir)
......