Unverified Commit d3f24dfa authored by Lysandre Debut, committed by GitHub

Merge branch 'master' into master

parents 4b543c30 ecc4f1bd
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 OpenAI GPT-2 model. """
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
TFSequenceSummary, shape_list, get_initializer)
from .configuration_gpt2 import GPT2Config
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
logger = logging.getLogger(__name__)
TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
"gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
"gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
"distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}
def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
# build the network
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False)
return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
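# A minimal usage sketch for the helper above (illustrative only): the checkpoint
# path is a placeholder, and TFGPT2Model / GPT2Config are the classes defined in
# this module. The dummy forward pass inside the helper builds the variables
# before the PyTorch weights are copied over.
def _example_load_gpt2_pt_weights(pytorch_checkpoint_path="./gpt2_pytorch_model.bin"):
    config = GPT2Config()
    tf_model = TFGPT2Model(config)  # randomly initialized TF 2.0 model
    return load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path)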
def gelu(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the ReLU.
Original paper: https://arxiv.org/abs/1606.08415
Args:
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
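# A quick numerical sanity check (a sketch, not part of the public API): the tanh
# formula above approximates the exact GELU x * Phi(x), where Phi is the standard
# normal CDF computed here with tf.math.erf. The two agree to within roughly 1e-3.
def _check_gelu_approximation():
    x = tf.constant(np.linspace(-4.0, 4.0, 9), dtype=tf.float32)
    exact = x * 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
    approx = gelu(x)
    return tf.reduce_max(tf.abs(exact - approx))  # small maximum error of the tanh approximation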
class TFAttention(tf.keras.layers.Layer):
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
super(TFAttention, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
assert n_state % config.n_head == 0
self.n_ctx = n_ctx
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.pruned_heads = set()
def prune_heads(self, heads):
pass
@staticmethod
def causal_attention_mask(nd, ns, dtype):
"""1's in the lower triangle, counting from the lower right corner.
Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:,None]
j = tf.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)
def _attn(self, inputs, training=False):
q, k, v, attention_mask, head_mask = inputs
# q, k, v have shape [batch, heads, sequence, features]
w = tf.matmul(q, k, transpose_b=True)
if self.scale:
dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
w = w / tf.math.sqrt(dk)
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
_, _, nd, ns = shape_list(w)
b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
b = tf.reshape(b, [1, 1, nd, ns])
w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
# Apply the attention mask
w = w + attention_mask
w = tf.nn.softmax(w, axis=-1)
w = self.attn_dropout(w, training=training)
# Mask heads if we want to
if head_mask is not None:
w = w * head_mask
outputs = [tf.matmul(w, v)]
if self.output_attentions:
outputs.append(w)
return outputs
def merge_heads(self, x):
x = tf.transpose(x, [0, 2, 1, 3])
x_shape = shape_list(x)
new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
return tf.reshape(x, new_x_shape)
def split_heads(self, x):
x_shape = shape_list(x)
new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features)
def call(self, inputs, training=False):
x, layer_past, attention_mask, head_mask = inputs
x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2)
query = self.split_heads(query)
key = self.split_heads(key)
value = self.split_heads(value)
if layer_past is not None:
past_key, past_value = tf.unstack(layer_past, axis=1)
key = tf.concat([past_key, key], axis=-2)
value = tf.concat([past_value, value], axis=-2)
present = tf.stack([key, value], axis=1)
attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
a = attn_outputs[0]
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a, training=training)
outputs = [a, present] + attn_outputs[1:]
return outputs # a, present, (attentions)
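# A standalone sketch of the head split/merge used by TFAttention above: hidden
# states of shape [batch, seq_length, n_embd] are reshaped to
# [batch, n_head, seq_length, n_embd // n_head] for the attention product and
# merged back afterwards. The shapes below are illustrative only.
def _demo_split_merge_heads(batch=2, seq_length=5, n_embd=768, n_head=12):
    x = tf.random.normal((batch, seq_length, n_embd))
    # split_heads: [batch, seq, n_embd] -> [batch, n_head, seq, head_features]
    split = tf.transpose(tf.reshape(x, (batch, seq_length, n_head, n_embd // n_head)),
                         (0, 2, 1, 3))
    # merge_heads: the inverse operation
    merged = tf.reshape(tf.transpose(split, (0, 2, 1, 3)), (batch, seq_length, n_embd))
    return split.shape, tf.reduce_max(tf.abs(x - merged))  # round trip is exact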
class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super(TFMLP, self).__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
self.act = gelu
self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
def call(self, x, training=False):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
h2 = self.dropout(h2, training=training)
return h2
class TFBlock(tf.keras.layers.Layer):
def __init__(self, n_ctx, config, scale=False, **kwargs):
super(TFBlock, self).__init__(**kwargs)
nx = config.n_embd
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
self.mlp = TFMLP(4 * nx, config, name='mlp')
def call(self, inputs, training=False):
x, layer_past, attention_mask, head_mask = inputs
a = self.ln_1(x)
output_attn = self.attn([a, layer_past, attention_mask, head_mask], training=training)
a = output_attn[0] # output_attn: a, present, (attentions)
x = x + a
m = self.ln_2(x)
m = self.mlp(m, training=training)
x = x + m
outputs = [x] + output_attn[1:]
return outputs # x, present, (attentions)
class TFGPT2MainLayer(tf.keras.layers.Layer):
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2MainLayer, self).__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
self.wte = TFSharedEmbeddings(config.vocab_size,
config.hidden_size,
initializer_range=config.initializer_range,
name='wte')
self.wpe = tf.keras.layers.Embedding(config.n_positions,
config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name='wpe')
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx,
config,
scale=True,
name='h_._{}'.format(i)) for i in range(config.n_layer)]
self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f')
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past
attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
position_ids = inputs[4] if len(inputs) > 4 else position_ids
head_mask = inputs[5] if len(inputs) > 5 else head_mask
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
past = inputs.get('past', past)
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
if past is None:
past_length = 0
past = [None] * len(self.h)
else:
past_length = shape_list(past[0][0])[-2]
if position_ids is None:
position_ids = tf.range(past_length, shape_list(input_ids)[-1] + past_length, dtype=tf.int32)[tf.newaxis, :]
if attention_mask is not None:
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_mask = tf.cast(attention_mask, tf.float32)
attention_mask = (1.0 - attention_mask) * -10000.0
else:
attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
input_shape = shape_list(input_ids)
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
inputs_embeds = self.wte(input_ids, mode='embedding')
position_embeds = self.wpe(position_ids)
if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.wte(token_type_ids, mode='embedding')
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
hidden_states = self.drop(hidden_states, training=training)
output_shape = input_shape + [shape_list(hidden_states)[-1]]
presents = ()
all_attentions = []
all_hidden_states = ()
for i, (block, layer_past) in enumerate(zip(self.h, past)):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
outputs = block([hidden_states, layer_past, attention_mask, head_mask[i]], training=training)
hidden_states, present = outputs[:2]
presents = presents + (present,)
if self.output_attentions:
all_attentions.append(outputs[2])
hidden_states = self.ln_f(hidden_states)
hidden_states = tf.reshape(hidden_states, output_shape)
# Add last hidden state
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states, presents)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,)
return outputs # last hidden state, presents, (all hidden_states), (attentions)
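# A small sketch showing how the tuple returned by TFGPT2MainLayer.call can be
# unpacked, depending on the config flags. The ordering mirrors the comment on
# the return statement above; the helper name is illustrative.
def _unpack_gpt2_main_layer_outputs(outputs, output_hidden_states=False, output_attentions=False):
    last_hidden_state, presents = outputs[0], outputs[1]
    idx = 2
    all_hidden_states = None
    all_attentions = None
    if output_hidden_states:
        all_hidden_states = outputs[idx]
        idx += 1
    if output_attentions:
        all_attentions = outputs[idx]
    return last_hidden_state, presents, all_hidden_states, all_attentions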
class TFGPT2PreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = GPT2Config
pretrained_model_archive_map = TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
load_pt_weights = load_gpt2_pt_weights_in_tf2
base_model_prefix = "transformer"
GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in
`Language Models are Unsupervised Multitask Learners`_
by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
corpus of ~40 GB of text data.
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
.. _`Language Models are Unsupervised Multitask Learners`:
https://openai.com/blog/better-language-models/
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument (a short usage sketch follows this docstring):
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
GPT2_INPUTS_DOCSTRING = r""" Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`transformers.GPT2Tokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**past**:
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `past` output below). Can be used to speed up sequential decoding.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class TFGPT2Model(TFGPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**past**:
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
that contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2Model.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2Model, self).__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class TFGPT2LMHeadModel(TFGPT2PreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**past**:
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
that contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2LMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
lm_logits = self.transformer.wte(hidden_states, mode="linear")
outputs = (lm_logits,) + transformer_outputs[1:]
return outputs # lm_logits, presents, (all hidden_states), (attentions)
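# A sketch of incremental decoding with the `past` cache returned above: the first
# call processes the full prompt, subsequent calls feed only the newly sampled
# token together with the cached key/value states. Greedy sampling and the number
# of steps are illustrative; `prompt_ids` is an int32 tensor of shape (1, prompt_length).
def _demo_generate_with_past(prompt_ids, steps=5):
    model = TFGPT2LMHeadModel.from_pretrained('gpt2')
    generated = prompt_ids
    lm_logits, past = model(prompt_ids)[:2]
    for _ in range(steps):
        next_token = tf.argmax(lm_logits[:, -1, :], axis=-1, output_type=tf.int32)[:, tf.newaxis]
        generated = tf.concat([generated, next_token], axis=-1)
        # only the new token is fed in; the cached states cover the earlier positions
        lm_logits, past = model(next_token, past=past)[:2]
    return generated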
@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**lm_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**mc_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices)``
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
**past**:
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
that contains pre-computed hidden-states (key and values in the attention blocks).
Can be used (see `past` input) to speed up sequential decoding.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
# Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly added token is the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFGPT2MainLayer(config, name='transformer')
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
def call(self, inputs, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
past = inputs[1] if len(inputs) > 1 else past
attention_mask = inputs[2] if len(inputs) > 2 else attention_mask
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
position_ids = inputs[4] if len(inputs) > 4 else position_ids
head_mask = inputs[5] if len(inputs) > 5 else head_mask
mc_token_ids = inputs[6] if len(inputs) > 6 else mc_token_ids
assert len(inputs) <= 7, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
past = inputs.get('past', past)
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
assert len(inputs) <= 7, "Too many inputs."
else:
input_ids = inputs
input_shapes = shape_list(input_ids)
seq_length = input_shapes[-1]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, past, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
transformer_outputs = self.transformer(flat_inputs, training=training)
hidden_states = transformer_outputs[0]
hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
lm_logits = self.transformer.wte(hidden_states, mode="linear")
mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
mc_logits = tf.squeeze(mc_logits, axis=-1)
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
return outputs # lm logits, mc logits, presents, (all hidden_states), (attentions)
# coding=utf-8
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 OpenAI GPT model."""
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import (TFPreTrainedModel, TFConv1D, TFSharedEmbeddings,
TFSequenceSummary, shape_list, get_initializer)
from .configuration_openai import OpenAIGPTConfig
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
logger = logging.getLogger(__name__)
TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-tf_model.h5"}
def load_openai_gpt_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
# build the network
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False)
return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
def gelu(x):
"""Gaussian Error Linear Unit.
This is a smoother version of the ReLU.
Original paper: https://arxiv.org/abs/1606.08415
Args:
x: float Tensor to perform activation.
Returns:
`x` with the GELU activation applied.
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
def swish(x):
return x * tf.math.sigmoid(x)
ACT_FNS = {"gelu": tf.keras.layers.Activation(gelu),
"relu": tf.keras.activations.relu,
"swish": tf.keras.layers.Activation(swish)}
class TFAttention(tf.keras.layers.Layer):
def __init__(self, nx, n_ctx, config, scale=False, **kwargs):
super(TFAttention, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
n_state = nx # in Attention: n_state=768 (nx=n_embd)
# [switch nx => n_state from Block to Attention to keep identical to TF implem]
assert n_state % config.n_head == 0
self.n_ctx = n_ctx
self.n_head = config.n_head
self.split_size = n_state
self.scale = scale
self.c_attn = TFConv1D(n_state * 3, nx, initializer_range=config.initializer_range, name='c_attn')
self.c_proj = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_proj')
self.attn_dropout = tf.keras.layers.Dropout(config.attn_pdrop)
self.resid_dropout = tf.keras.layers.Dropout(config.resid_pdrop)
self.pruned_heads = set()
def prune_heads(self, heads):
pass
@staticmethod
def causal_attention_mask(nd, ns, dtype):
"""1's in the lower triangle, counting from the lower right corner.
Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
"""
i = tf.range(nd)[:,None]
j = tf.range(ns)
m = i >= j - ns + nd
return tf.cast(m, dtype)
def _attn(self, inputs, training=False):
q, k, v, attention_mask, head_mask = inputs
# q, k, v have shape [batch, heads, sequence, features]
w = tf.matmul(q, k, transpose_b=True)
if self.scale:
dk = tf.cast(tf.shape(k)[-1], tf.float32) # scale attention_scores
w = w / tf.math.sqrt(dk)
# w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
_, _, nd, ns = shape_list(w)
b = self.causal_attention_mask(nd, ns, dtype=w.dtype)
b = tf.reshape(b, [1, 1, nd, ns])
w = w * b - 1e4 * (1 - b)
if attention_mask is not None:
# Apply the attention mask
w = w + attention_mask
w = tf.nn.softmax(w, axis=-1)
w = self.attn_dropout(w, training=training)
# Mask heads if we want to
if head_mask is not None:
w = w * head_mask
outputs = [tf.matmul(w, v)]
if self.output_attentions:
outputs.append(w)
return outputs
def merge_heads(self, x):
x = tf.transpose(x, [0, 2, 1, 3])
x_shape = shape_list(x)
new_x_shape = x_shape[:-2] + [x_shape[-2] * x_shape[-1]]
return tf.reshape(x, new_x_shape)
def split_heads(self, x):
x_shape = shape_list(x)
new_x_shape = x_shape[:-1] + [self.n_head, x_shape[-1] // self.n_head]
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3)) # (batch, head, seq_length, head_features)
def call(self, inputs, training=False):
x, attention_mask, head_mask = inputs
x = self.c_attn(x)
query, key, value = tf.split(x, 3, axis=2)
query = self.split_heads(query)
key = self.split_heads(key)
value = self.split_heads(value)
attn_outputs = self._attn([query, key, value, attention_mask, head_mask], training=training)
a = attn_outputs[0]
a = self.merge_heads(a)
a = self.c_proj(a)
a = self.resid_dropout(a, training=training)
outputs = [a] + attn_outputs[1:]
return outputs # a, (attentions)
class TFMLP(tf.keras.layers.Layer):
def __init__(self, n_state, config, **kwargs):
super(TFMLP, self).__init__(**kwargs)
nx = config.n_embd
self.c_fc = TFConv1D(n_state, nx, initializer_range=config.initializer_range, name='c_fc')
self.c_proj = TFConv1D(nx, n_state, initializer_range=config.initializer_range, name='c_proj')
self.act = gelu
self.dropout = tf.keras.layers.Dropout(config.resid_pdrop)
def call(self, x, training=False):
h = self.act(self.c_fc(x))
h2 = self.c_proj(h)
h2 = self.dropout(h2, training=training)
return h2
class TFBlock(tf.keras.layers.Layer):
def __init__(self, n_ctx, config, scale=False, **kwargs):
super(TFBlock, self).__init__(**kwargs)
nx = config.n_embd
self.attn = TFAttention(nx, n_ctx, config, scale, name='attn')
self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_1')
self.mlp = TFMLP(4 * nx, config, name='mlp')
self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_2')
def call(self, inputs, training=False):
x, attention_mask, head_mask = inputs
output_attn = self.attn([x, attention_mask, head_mask], training=training)
a = output_attn[0] # output_attn: a, (attentions)
n = self.ln_1(x + a)
m = self.mlp(n, training=training)
h = self.ln_2(n + m)
outputs = [h] + output_attn[1:]
return outputs # x, (attentions)
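# A schematic contrast between this block and the GPT-2 TFBlock earlier in this
# commit: the original GPT applies LayerNormalization after each residual addition
# (post-norm), whereas GPT-2 normalizes the input of each sub-layer (pre-norm).
# `attn_fn` and `mlp_fn` stand in for the attention and MLP sub-layers.
def _post_norm_block(x, attn_fn, mlp_fn, ln_1, ln_2):
    n = ln_1(x + attn_fn(x))        # ordering used by this file (OpenAI GPT)
    return ln_2(n + mlp_fn(n))

def _pre_norm_block(x, attn_fn, mlp_fn, ln_1, ln_2):
    x = x + attn_fn(ln_1(x))        # ordering used by the GPT-2 block above
    return x + mlp_fn(ln_2(x))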
class TFOpenAIGPTMainLayer(tf.keras.layers.Layer):
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTMainLayer, self).__init__(config, *inputs, **kwargs)
self.output_hidden_states = config.output_hidden_states
self.output_attentions = config.output_attentions
self.num_hidden_layers = config.n_layer
self.vocab_size = config.vocab_size
self.n_embd = config.n_embd
self.tokens_embed = TFSharedEmbeddings(config.vocab_size,
config.n_embd,
initializer_range=config.initializer_range,
name='tokens_embed')
self.positions_embed = tf.keras.layers.Embedding(config.n_positions,
config.n_embd,
embeddings_initializer=get_initializer(config.initializer_range),
name='positions_embed')
self.drop = tf.keras.layers.Dropout(config.embd_pdrop)
self.h = [TFBlock(config.n_ctx,
config,
scale=True,
name='h_._{}'.format(i)) for i in range(config.n_layer)]
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask
assert len(inputs) <= 5, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 5, "Too many inputs."
else:
input_ids = inputs
if position_ids is None:
position_ids = tf.range(shape_list(input_ids)[-1], dtype=tf.int32)[tf.newaxis, :]
if attention_mask is not None:
# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
attention_mask = tf.cast(attention_mask, tf.float32)
attention_mask = (1.0 - attention_mask) * -10000.0
else:
attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.num_hidden_layers
# head_mask = tf.constant([0] * self.num_hidden_layers)
input_shape = shape_list(input_ids)
input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
inputs_embeds = self.tokens_embed(input_ids, mode='embedding')
position_embeds = self.positions_embed(position_ids)
if token_type_ids is not None:
token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
token_type_embeds = self.tokens_embed(token_type_ids, mode='embedding')
else:
token_type_embeds = 0
hidden_states = inputs_embeds + position_embeds + token_type_embeds
hidden_states = self.drop(hidden_states, training=training)
output_shape = input_shape + [shape_list(hidden_states)[-1]]
all_attentions = []
all_hidden_states = ()
for i, block in enumerate(self.h):
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
outputs = block([hidden_states, attention_mask, head_mask[i]], training=training)
hidden_states = outputs[0]
if self.output_attentions:
all_attentions.append(outputs[1])
hidden_states = tf.reshape(hidden_states, output_shape)
# Add last hidden state
if self.output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
outputs = (hidden_states,)
if self.output_hidden_states:
outputs = outputs + (all_hidden_states,)
if self.output_attentions:
# let the number of heads free (-1) so we can extract attention even after head pruning
attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
outputs = outputs + (all_attentions,)
return outputs # last hidden state, (all hidden_states), (attentions)
class TFOpenAIGPTPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = OpenAIGPTConfig
pretrained_model_archive_map = TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
load_pt_weights = load_openai_gpt_pt_weights_in_tf2
base_model_prefix = "transformer"
OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in
`Improving Language Understanding by Generative Pre-Training`_
by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
It's a causal (unidirectional) transformer pre-trained using language modeling on a large
corpus with long range dependencies, the Toronto Book Corpus.
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
.. _`Improving Language Understanding by Generative Pre-Training`:
https://openai.com/blog/language-unsupervised/
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTLMHeadModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_states = transformer_outputs[0]
lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
outputs = (lm_logits,) + transformer_outputs[1:]
return outputs # lm_logits, (all hidden_states), (attentions)
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
The language modeling head has its weights tied to the input embeddings,
the classification head takes as input the hidden state at a specified classification token index in the input sequence.
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
r"""
**mc_token_ids**: (`optional`, defaults to the index of the last token of the input) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_choices)``:
Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1]``.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**lm_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**mc_prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, num_choices)``
Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
# Add a [CLS] to the vocabulary (we should train it also!)
# This option is currently not implemented in TF 2.0
raise NotImplementedError
tokenizer.add_special_tokens({'cls_token': '[CLS]'})
model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size
print(tokenizer.cls_token_id, len(tokenizer)) # The newly added token is the last token of the vocabulary
choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
input_ids = tf.constant(encoded_choices)[None, :] # Batch size: 1, number of choices: 2
mc_token_ids = tf.constant([cls_token_location]) # Batch size: 1
outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, mc_token_ids=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
position_ids = inputs[3] if len(inputs) > 3 else position_ids
head_mask = inputs[4] if len(inputs) > 4 else head_mask
mc_token_ids = inputs[5] if len(inputs) > 5 else mc_token_ids
assert len(inputs) <= 6, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
head_mask = inputs.get('head_mask', head_mask)
mc_token_ids = inputs.get('mc_token_ids', mc_token_ids)
assert len(inputs) <= 6, "Too many inputs."
else:
input_ids = inputs
input_shapes = shape_list(input_ids)
seq_length = input_shapes[-1]
flat_input_ids = tf.reshape(input_ids, (-1, seq_length))
flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None
flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None
flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None
flat_inputs = [flat_input_ids, flat_attention_mask, flat_token_type_ids, flat_position_ids, head_mask]
transformer_outputs = self.transformer(flat_inputs, training=training)
hidden_states = transformer_outputs[0]
hidden_states = tf.reshape(hidden_states, input_shapes + shape_list(hidden_states)[-1:])
lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear")
mc_logits = self.multiple_choice_head([hidden_states, mc_token_ids], training=training)
mc_logits = tf.squeeze(mc_logits, axis=-1)
outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
return outputs # lm logits, mc logits, (all hidden_states), (attentions)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch - TF 2.0 general utilities."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import os
import re
import numpy
logger = logging.getLogger(__name__)
DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
def convert_tf_weight_name_to_pt_weight_name(tf_name, start_prefix_to_remove=''):
""" Convert a TF 2.0 model variable name in a pytorch model weight name.
Conventions for TF2.0 scopes -> PyTorch attribute names conversions:
- '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
- '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
return tuple with:
- pytorch model weight name
- transpose: boolean indicating whether the TF 2.0 and PyTorch weight matrices are transposed with regard to each other
"""
tf_name = tf_name.replace(':0', '') # device ids
tf_name = re.sub(r'/[^/]*___([^/]*)/', r'/\1/', tf_name) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch)
tf_name = tf_name.replace('_._', '/') # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList)
tf_name = re.sub(r'//+', '/', tf_name) # Collapse repeated '/' separators
tf_name = tf_name.split('/') # Convert from TF2.0 '/' separators to PyTorch '.' separators
tf_name = tf_name[1:] # Remove level zero
# When should we transpose the weights
transpose = bool(tf_name[-1] == 'kernel' or 'emb_projs' in tf_name or 'out_projs' in tf_name)
# Convert standard TF2.0 names in PyTorch names
if tf_name[-1] == 'kernel' or tf_name[-1] == 'embeddings' or tf_name[-1] == 'gamma':
tf_name[-1] = 'weight'
if tf_name[-1] == 'beta':
tf_name[-1] = 'bias'
# Remove prefix if needed
tf_name = '.'.join(tf_name)
if start_prefix_to_remove:
tf_name = tf_name.replace(start_prefix_to_remove, '', 1)
return tf_name, transpose
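# A sketch of the conversion above on a plausible TF 2.0 variable name (not taken
# from a real checkpoint): level zero ('tf_gpt2_lm_head_model') is dropped, '_._'
# becomes a new level in the dotted PyTorch name, and 'kernel' maps to 'weight'
# while flagging a transpose.
def _demo_weight_name_conversion():
    name, transpose = convert_tf_weight_name_to_pt_weight_name(
        'tf_gpt2_lm_head_model/transformer/h_._0/attn/c_attn/kernel:0',
        start_prefix_to_remove='transformer.')
    # expected: name == 'h.0.attn.c_attn.weight', transpose is True
    return name, transpose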
#####################
### PyTorch => TF 2.0
def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch checkpoints in a TF 2.0 model
"""
try:
import tensorflow as tf
import torch
except ImportError as e:
logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
raise e
pt_path = os.path.abspath(pytorch_checkpoint_path)
logger.info("Loading PyTorch weights from {}".format(pt_path))
pt_state_dict = torch.load(pt_path, map_location='cpu')
return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch checkpoints in a TF 2.0 model
"""
pt_state_dict = pt_model.state_dict()
return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, allow_missing_keys=False):
""" Load pytorch state_dict in a TF 2.0 model.
"""
try:
import torch
import tensorflow as tf
from tensorflow.python.keras import backend as K
except ImportError as e:
logger.error("Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
raise e
if tf_inputs is None:
tf_inputs = tf.constant(DUMMY_INPUTS)
if tf_inputs is not None:
tfo = tf_model(tf_inputs, training=False) # Make sure model is built
# Adapt state dict - TODO remove this and update the AWS weights files instead
# Convert old format to new format if needed from a PyTorch state_dict
old_keys = []
new_keys = []
for key in pt_state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
pt_state_dict[new_key] = pt_state_dict.pop(old_key)
# Make sure we are able to load PyTorch base models as well as derived models (with heads)
# TF models always have a prefix, some of PyTorch models (base ones) don't
start_prefix_to_remove = ''
if not any(s.startswith(tf_model.base_model_prefix) for s in pt_state_dict.keys()):
start_prefix_to_remove = tf_model.base_model_prefix + '.'
symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
weight_value_tuples = []
all_pytorch_weights = set(list(pt_state_dict.keys()))
for symbolic_weight in symbolic_weights:
sw_name = symbolic_weight.name
name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)
# Find associated numpy array in pytorch model state dict
assert name in pt_state_dict, "{} not found in PyTorch model".format(name)
array = pt_state_dict[name].numpy()
if transpose:
array = numpy.transpose(array)
if len(symbolic_weight.shape) < len(array.shape):
array = numpy.squeeze(array)
elif len(symbolic_weight.shape) > len(array.shape):
array = numpy.expand_dims(array, axis=0)
try:
assert list(symbolic_weight.shape) == list(array.shape)
except AssertionError as e:
e.args += (symbolic_weight.shape, array.shape)
raise e
logger.info("Initialize TF weight {}".format(symbolic_weight.name))
weight_value_tuples.append((symbolic_weight, array))
all_pytorch_weights.discard(name)
K.batch_set_value(weight_value_tuples)
if tf_inputs is not None:
tfo = tf_model(tf_inputs, training=False) # Make sure restore ops are run
logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))
return tf_model
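# Illustrative usage sketch (assumptions: the `transformers` package is importable and a
# PyTorch checkpoint saved with torch.save(model.state_dict(), path) exists at the
# hypothetical path below).
def _example_load_pt_checkpoint_in_tf2():
    from transformers import BertConfig, TFBertModel
    config = BertConfig.from_pretrained('bert-base-uncased')
    tf_model = TFBertModel(config)
    # Builds the TF 2.0 model on dummy inputs, then copies every PyTorch weight into the
    # matching TF variable, transposing kernels where necessary.
    return load_pytorch_checkpoint_in_tf2_model(tf_model, './pytorch_model.bin')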
#####################
### TF 2.0 => PyTorch
def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False):
""" Load TF 2.0 HDF5 checkpoint in a PyTorch model
We use HDF5 to easily do transfer learning
(see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357).
"""
try:
import tensorflow as tf
import torch
except ImportError as e:
logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
raise e
import transformers
tf_path = os.path.abspath(tf_checkpoint_path)
logger.info("Loading TensorFlow weights from {}".format(tf_checkpoint_path))
# Instantiate and load the associated TF 2.0 model
tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beggining
tf_model_class = getattr(transformers, tf_model_class_name)
tf_model = tf_model_class(pt_model.config)
if tf_inputs is None:
tf_inputs = tf.constant(DUMMY_INPUTS)
if tf_inputs is not None:
tfo = tf_model(tf_inputs, training=False) # Make sure model is built
tf_model.load_weights(tf_checkpoint_path, by_name=True)
return load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=allow_missing_keys)
def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False):
""" Load TF 2.0 model in a pytorch model
"""
weights = tf_model.weights
return load_tf2_weights_in_pytorch_model(pt_model, weights, allow_missing_keys=allow_missing_keys)
def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False):
""" Load TF2.0 symbolic weights in a PyTorch model
"""
try:
import tensorflow as tf
import torch
except ImportError as e:
logger.error("Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
raise e
new_pt_params_dict = {}
current_pt_params_dict = dict(pt_model.named_parameters())
# Make sure we are able to load PyTorch base models as well as derived models (with heads)
# TF models always have a prefix, some of PyTorch models (base ones) don't
start_prefix_to_remove = ''
if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()):
start_prefix_to_remove = pt_model.base_model_prefix + '.'
# Build a map from potential PyTorch weight names to TF 2.0 Variables
tf_weights_map = {}
for tf_weight in tf_weights:
pt_name, transpose = convert_tf_weight_name_to_pt_weight_name(tf_weight.name, start_prefix_to_remove=start_prefix_to_remove)
tf_weights_map[pt_name] = (tf_weight.numpy(), transpose)
all_tf_weights = set(list(tf_weights_map.keys()))
loaded_pt_weights_data_ptr = {}
for pt_weight_name, pt_weight in current_pt_params_dict.items():
# Handle PyTorch shared weights (not duplicated in TF 2.0)
if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()]
continue
# Find associated numpy array in pytorch model state dict
if pt_weight_name not in tf_weights_map:
raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name))
array, transpose = tf_weights_map[pt_weight_name]
if transpose:
array = numpy.transpose(array)
if len(pt_weight.shape) < len(array.shape):
array = numpy.squeeze(array)
elif len(pt_weight.shape) > len(array.shape):
array = numpy.expand_dims(array, axis=0)
try:
assert list(pt_weight.shape) == list(array.shape)
except AssertionError as e:
e.args += (pt_weight.shape, array.shape)
raise e
logger.info("Initialize PyTorch weight {}".format(pt_weight_name))
new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
all_tf_weights.discard(pt_weight_name)
missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
if len(missing_keys) > 0:
logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
pt_model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
logger.info("Weights from TF 2.0 model not used in {}: {}".format(
pt_model.__class__.__name__, unexpected_keys))
logger.info("Weights or buffers not loaded from TF 2.0 model: {}".format(all_tf_weights))
return pt_model
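# Illustrative sketch of the reverse direction (the TF 2.0 class is looked up by prefixing
# the PyTorch class name with "TF"; the .h5 path below is hypothetical).
def _example_load_tf2_checkpoint_in_pytorch():
    from transformers import BertConfig, BertModel
    config = BertConfig.from_pretrained('bert-base-uncased')
    pt_model = BertModel(config)
    # Instantiates TFBertModel from the same config, loads the HDF5 weights by name and
    # copies each TF variable into the corresponding PyTorch parameter.
    return load_tf2_checkpoint_in_pytorch_model(pt_model, './tf_model.h5')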
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 RoBERTa model. """
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import numpy as np
import tensorflow as tf
from .configuration_roberta import RobertaConfig
from .modeling_tf_utils import TFPreTrainedModel, get_initializer
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu, gelu_new
logger = logging.getLogger(__name__)
TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
}
def load_roberta_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
# build the network
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False)
return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
class TFRobertaEmbeddings(TFBertEmbeddings):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
def __init__(self, config, **kwargs):
super(TFRobertaEmbeddings, self).__init__(config, **kwargs)
self.padding_idx = 1
def _embedding(self, inputs, training=False):
"""Applies embedding based on inputs tensor."""
input_ids, position_ids, token_type_ids = inputs
seq_length = tf.shape(input_ids)[1]
if position_ids is None:
position_ids = tf.range(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=tf.int32)[tf.newaxis, :]
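# With padding_idx = 1 this yields positions [2, ..., seq_length + 1], matching the
# original fairseq RoBERTa convention where position index 1 is reserved for padding.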
return super(TFRobertaEmbeddings, self)._embedding([input_ids, position_ids, token_type_ids], training=training)
class TFRobertaMainLayer(TFBertMainLayer):
"""
Same as TFBertMainLayer but uses TFRobertaEmbeddings.
"""
def __init__(self, config, **kwargs):
super(TFRobertaMainLayer, self).__init__(config, **kwargs)
self.embeddings = TFRobertaEmbeddings(config, name='embeddings')
def call(self, inputs, **kwargs):
# Check that input_ids starts with control token
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
else:
input_ids = inputs
if tf.not_equal(tf.reduce_sum(input_ids[:, 0]), 0):
logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
"This model requires special tokens in order to work. "
"Please specify add_special_tokens=True in your encoding.")
return super(TFRobertaMainLayer, self).call(inputs, **kwargs)
class TFRobertaPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = RobertaConfig
pretrained_model_archive_map = TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
load_pt_weights = load_roberta_pt_weights_in_tf2
base_model_prefix = "roberta"
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
`RoBERTa: A Robustly Optimized BERT Pretraining Approach`_
by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
Veselin Stoyanov. It is based on Google's BERT model released in 2018.
It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
objective and training with much larger mini-batches and learning rates.
This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained
models.
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matters related to general usage and behavior.
.. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`:
https://arxiv.org/abs/1907.11692
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument (a minimal sketch follows this docstring):
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
model. Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
ROBERTA_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
(a) For sequence pairs:
``tokens: <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
(b) For single sequences:
``tokens: <s> the dog is hairy . </s>``
Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with
the ``add_special_tokens`` parameter set to ``True``.
RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**token_type_ids**: (`optional`, needs to be trained) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Optional segment token indices to indicate first and second portions of the inputs.
This embedding matrix is not pretrained (it is not trained during RoBERTa pretraining); you will have to train it
during fine-tuning.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
corresponds to a `sentence B` token
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1[``.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class TFRobertaModel(TFRobertaPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the output of the last layer of the model.
**pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
Last layer hidden-state of the first token of the sequence (classification token),
further processed by a Linear layer and a Tanh activation function.
This output is usually *not* a good summary of the semantic content of the input;
you're often better off averaging or pooling the sequence of hidden-states
for the whole input sequence.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFRobertaModel, self).__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name='roberta')
def call(self, inputs, **kwargs):
outputs = self.roberta(inputs, **kwargs)
return outputs
class TFRobertaLMHead(tf.keras.layers.Layer):
"""Roberta Head for masked language modeling."""
def __init__(self, config, input_embeddings, **kwargs):
super(TFRobertaLMHead, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
name='dense')
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
self.act = tf.keras.layers.Activation(gelu)
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.decoder = input_embeddings
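# Calling the shared embedding layer with mode="linear" multiplies the hidden states by the
# transposed word embedding matrix, i.e. the usual input/output embedding weight tying.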
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
super(TFRobertaLMHead, self).build(input_shape)
def call(self, features):
x = self.dense(features)
x = self.act(x)
x = self.layer_norm(x)
# project back to size of vocabulary with bias
x = self.decoder(x, mode="linear") + self.bias
return x
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
r"""
**masked_lm_labels**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Labels for computing the masked language modeling loss.
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]``
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``tf.Tensor`` of shape ``(1,)``:
Masked language modeling loss.
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForMaskedLM
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids)
prediction_scores = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFRobertaForMaskedLM, self).__init__(config, *inputs, **kwargs)
self.roberta = TFRobertaMainLayer(config, name="roberta")
self.lm_head = TFRobertaLMHead(config, self.roberta.embeddings, name="lm_head")
def call(self, inputs, **kwargs):
outputs = self.roberta(inputs, **kwargs)
sequence_output = outputs[0]
prediction_scores = self.lm_head(sequence_output)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
return outputs # prediction_scores, (hidden_states), (attentions)
class TFRobertaClassificationHead(tf.keras.layers.Layer):
"""Head for sentence-level classification tasks."""
def __init__(self, config, **kwargs):
super(TFRobertaClassificationHead, self).__init__(**kwargs)
self.dense = tf.keras.layers.Dense(config.hidden_size,
kernel_initializer=get_initializer(config.initializer_range),
activation='tanh',
name="dense")
self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
self.out_proj = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name="out_proj")
def call(self, features, training=False):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x, training=training)
x = self.dense(x)
x = self.dropout(x, training=training)
x = self.out_proj(x)
return x
@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
on top of the pooled output) e.g. for GLUE tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.constant([1])[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFRobertaForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.roberta = TFRobertaMainLayer(config, name="roberta")
self.classifier = TFRobertaClassificationHead(config, name="classifier")
def call(self, inputs, **kwargs):
outputs = self.roberta(inputs, **kwargs)
sequence_output = outputs[0]
logits = self.classifier(sequence_output, training=kwargs.get('training', False))
outputs = (logits,) + outputs[2:]
return outputs # logits, (hidden_states), (attentions)
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 Transformer XL model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import json
import math
import logging
import collections
import sys
from io import open
import numpy as np
import tensorflow as tf
from .configuration_transfo_xl import TransfoXLConfig
from .modeling_tf_utils import TFPreTrainedModel, TFConv1D, TFSequenceSummary, shape_list, get_initializer
from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
logger = logging.getLogger(__name__)
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = {
'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-tf_model.h5",
}
def load_transfo_xl_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
# build the network
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False)
return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
class TFPositionalEmbedding(tf.keras.layers.Layer):
def __init__(self, demb, **kwargs):
super(TFPositionalEmbedding, self).__init__(**kwargs)
self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
def call(self, pos_seq, bsz=None):
sinusoid_inp = tf.einsum('i,j->ij', pos_seq, self.inv_freq)
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
if bsz is not None:
return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
else:
return pos_emb[:, None, :]
class TFPositionwiseFF(tf.keras.layers.Layer):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
super(TFPositionwiseFF, self).__init__(**kwargs)
self.d_model = d_model
self.d_inner = d_inner
self.dropout = dropout
self.layer_1 = tf.keras.layers.Dense(d_inner,
kernel_initializer=get_initializer(init_std),
activation=tf.nn.relu,
name='CoreNet_._0')
self.drop_1 = tf.keras.layers.Dropout(dropout)
self.layer_2 = tf.keras.layers.Dense(d_model,
kernel_initializer=get_initializer(init_std),
name='CoreNet_._3')
self.drop_2 = tf.keras.layers.Dropout(dropout)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')
self.pre_lnorm = pre_lnorm
def call(self, inp, training=False):
if self.pre_lnorm:
##### layer normalization + positionwise feed-forward
core_out = self.layer_norm(inp)
core_out = self.layer_1(core_out)
core_out = self.drop_1(core_out, training=training)
core_out = self.layer_2(core_out)
core_out = self.drop_2(core_out, training=training)
##### residual connection
output = core_out + inp
else:
##### positionwise feed-forward
core_out = self.layer_1(inp)
core_out = self.drop_1(core_out, training=training)
core_out = self.layer_2(core_out)
core_out = self.drop_2(core_out, training=training)
##### residual connection + layer normalization
output = self.layer_norm(inp + core_out)
return output
class TFRelPartialLearnableMultiHeadAttn(tf.keras.layers.Layer):
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
r_r_bias=None, r_w_bias=None, output_attentions=False,
layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
super(TFRelPartialLearnableMultiHeadAttn, self).__init__(**kwargs)
self.output_attentions = output_attentions
self.n_head = n_head
self.d_model = d_model
self.d_head = d_head
self.dropout = dropout
self.qkv_net = tf.keras.layers.Dense(3 * n_head * d_head,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name='qkv_net')
self.drop = tf.keras.layers.Dropout(dropout)
self.dropatt = tf.keras.layers.Dropout(dropatt)
self.o_net = tf.keras.layers.Dense(d_model,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name='o_net')
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name='layer_norm')
self.scale = 1 / (d_head ** 0.5)
self.pre_lnorm = pre_lnorm
if r_r_bias is not None and r_w_bias is not None: # Biases are shared
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
else:
self.r_r_bias = None
self.r_w_bias = None
self.r_net = tf.keras.layers.Dense(self.n_head * self.d_head,
kernel_initializer=get_initializer(init_std),
use_bias=False,
name='r_net')
def build(self, input_shape):
if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared
self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True,
name='r_r_bias')
self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True,
name='r_w_bias')
super(TFRelPartialLearnableMultiHeadAttn, self).build(input_shape)
def _rel_shift(self, x):
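# Relative positional shift from the Transformer-XL paper: pad one extra slot, reshape so
# the padding rolls each row, then slice the padding off. This aligns the relative-position
# logits with the absolute key positions without materializing an index matrix.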
x_size = shape_list(x)
x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
x = tf.reshape(x, x_size)
return x
def call(self, inputs, training=False):
w, r, attn_mask, mems, head_mask = inputs
qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1]
if mems is not None:
cat = tf.concat([mems, w], 0)
if self.pre_lnorm:
w_heads = self.qkv_net(self.layer_norm(cat))
else:
w_heads = self.qkv_net(cat)
r_head_k = self.r_net(r)
w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
w_head_q = w_head_q[-qlen:]
else:
if self.pre_lnorm:
w_heads = self.qkv_net(self.layer_norm(w))
else:
w_heads = self.qkv_net(w)
r_head_k = self.r_net(r)
w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
klen = shape_list(w_head_k)[0]
w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head
w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head
w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head
r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head
#### compute attention score
rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head
AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k) # qlen x klen x bsz x n_head
rr_head_q = w_head_q + self.r_r_bias
BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k) # qlen x klen x bsz x n_head
BD = self._rel_shift(BD)
# [qlen x klen x bsz x n_head]
attn_score = AC + BD
attn_score = attn_score * self.scale
#### compute attention probability
if attn_mask is not None:
attn_mask_t = attn_mask[:, :, None, None]
attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t
# [qlen x klen x bsz x n_head]
attn_prob = tf.nn.softmax(attn_score, axis=1)
attn_prob = self.dropatt(attn_prob, training=training)
# Mask heads if we want to
if head_mask is not None:
attn_prob = attn_prob * head_mask
#### compute attention vector
attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)
# [qlen x bsz x n_head x d_head]
attn_vec_sizes = shape_list(attn_vec)
attn_vec = tf.reshape(attn_vec,
(attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head))
##### linear projection
attn_out = self.o_net(attn_vec)
attn_out = self.drop(attn_out, training=training)
if self.pre_lnorm:
##### residual connection
outputs = [w + attn_out]
else:
##### residual connection + layer normalization
outputs = [self.layer_norm(w + attn_out)]
if self.output_attentions:
outputs.append(attn_prob)
return outputs
class TFRelPartialLearnableDecoderLayer(tf.keras.layers.Layer):
def __init__(self, n_head, d_model, d_head, d_inner, dropout,
tgt_len=None, ext_len=None, mem_len=None,
dropatt=0., pre_lnorm=False,
r_w_bias=None,
r_r_bias=None,
output_attentions=False,
layer_norm_epsilon=1e-5,
init_std=0.02,
**kwargs):
super(TFRelPartialLearnableDecoderLayer, self).__init__(**kwargs)
self.dec_attn = TFRelPartialLearnableMultiHeadAttn(n_head, d_model,
d_head, dropout, tgt_len=tgt_len, ext_len=ext_len,
mem_len=mem_len, dropatt=dropatt, pre_lnorm=pre_lnorm,
r_w_bias=r_w_bias, r_r_bias=r_r_bias, init_std=init_std,
output_attentions=output_attentions,
layer_norm_epsilon=layer_norm_epsilon, name='dec_attn')
self.pos_ff = TFPositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=pre_lnorm, init_std=init_std,
layer_norm_epsilon=layer_norm_epsilon,
name='pos_ff')
def call(self, inputs, training=False):
dec_inp, r, dec_attn_mask, mems, head_mask = inputs
attn_outputs = self.dec_attn([dec_inp, r, dec_attn_mask,
mems, head_mask], training=training)
ff_output = self.pos_ff(attn_outputs[0], training=training)
outputs = [ff_output] + attn_outputs[1:]
return outputs
class TFAdaptiveEmbedding(tf.keras.layers.Layer):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02,
sample_softmax=False, **kwargs):
super(TFAdaptiveEmbedding, self).__init__(**kwargs)
self.n_token = n_token
self.d_embed = d_embed
self.init_std = init_std
self.cutoffs = cutoffs + [n_token]
self.div_val = div_val
self.d_proj = d_proj
self.emb_scale = d_proj ** 0.5
self.cutoff_ends = [0] + self.cutoffs
self.emb_layers = []
self.emb_projs = []
if div_val == 1:
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
d_emb_i = d_embed // (div_val ** i)
self.emb_layers.append(tf.keras.layers.Embedding(r_idx-l_idx,
d_emb_i,
embeddings_initializer=get_initializer(init_std),
name='emb_layers_._{}'.format(i)))
def build(self, input_shape):
for i in range(len(self.cutoffs)):
d_emb_i = self.d_embed // (self.div_val ** i)
self.emb_projs.append(self.add_weight(shape=(d_emb_i, self.d_proj),
initializer=get_initializer(self.init_std),
trainable=True,
name='emb_projs_._{}'.format(i)))
super(TFAdaptiveEmbedding, self).build(input_shape)
def call(self, inp):
if self.div_val == 1:
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
else:
inp_flat = tf.reshape(inp, (-1,))
emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj])
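# For each vocabulary bucket [l_idx, r_idx): embed the matching tokens at the reduced
# dimension d_emb_i, project them up to d_proj, then scatter the rows back to their
# original positions in the flattened input.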
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx
emb_i = self.emb_layers[i](inp_i)
emb_i = tf.einsum('id,de->ie', emb_i, self.emb_projs[i])
mask_idx = tf.cast(tf.where(mask_i), dtype=tf.int64)
emb_flat += tf.scatter_nd(mask_idx, emb_i, tf.cast(tf.shape(emb_flat), dtype=tf.int64))
embed_shape = shape_list(inp) + [self.d_proj]
embed = tf.reshape(emb_flat, embed_shape)
embed *= self.emb_scale
return embed
class TFTransfoXLMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFTransfoXLMainLayer, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.n_token = config.n_token
self.d_embed = config.d_embed
self.d_model = config.d_model
self.n_head = config.n_head
self.d_head = config.d_head
self.untie_r = config.untie_r
self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
div_val=config.div_val, init_std=config.init_std, name='word_emb')
self.drop = tf.keras.layers.Dropout(config.dropout)
self.n_layer = config.n_layer
self.tgt_len = config.tgt_len
self.mem_len = config.mem_len
self.ext_len = config.ext_len
self.max_klen = config.tgt_len + config.ext_len + config.mem_len
self.attn_type = config.attn_type
self.layers = []
if config.attn_type == 0: # the default attention
for i in range(config.n_layer):
self.layers.append(
TFRelPartialLearnableDecoderLayer(
config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
r_w_bias=None if self.untie_r else self.r_w_bias,
r_r_bias=None if self.untie_r else self.r_r_bias,
output_attentions=self.output_attentions,
layer_norm_epsilon=config.layer_norm_epsilon,
init_std=config.init_std,
name='layers_._{}'.format(i))
)
else: # learnable embeddings and absolute embeddings
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
self.same_length = config.same_length
self.clamp_len = config.clamp_len
if self.attn_type == 0: # default attention
self.pos_emb = TFPositionalEmbedding(self.d_model, name='pos_emb')
else: # learnable embeddings and absolute embeddings
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
def build(self, input_shape):
if not self.untie_r:
self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True,
name='r_w_bias')
self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True,
name='r_r_bias')
super(TFTransfoXLMainLayer, self).build(input_shape)
def _resize_token_embeddings(self, new_num_tokens):
return self.word_emb
def backward_compatible(self):
self.sample_softmax = -1
def reset_length(self, tgt_len, ext_len, mem_len):
self.tgt_len = tgt_len
self.mem_len = mem_len
self.ext_len = ext_len
def _prune_heads(self, heads):
raise NotImplementedError
def init_mems(self, data):
if self.mem_len > 0:
mems = []
for i in range(self.n_layer):
empty = tf.zeros([self.mem_len, shape_list(data)[1], self.d_model])
mems.append(empty)
return mems
else:
return None
def _update_mems(self, hids, mems, qlen, mlen):
# does not deal with None
if mems is None: return None
# mems is not None
assert len(hids) == len(mems), 'len(hids) != len(mems)'
# There are `mlen + qlen` steps that can be cached into mems
# For the next step, the last `ext_len` of the `qlen` tokens
# will be used as the extended context. Hence, we only cache
# the tokens from `mlen + qlen - self.ext_len - self.mem_len`
# to `mlen + qlen - self.ext_len`.
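# Worked example (illustrative): with mlen=4, qlen=3, ext_len=0 and mem_len=4,
# end_idx = 4 + 3 = 7 and beg_idx = 3, so the new memory keeps the last 4 hidden states.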
new_mems = []
end_idx = mlen + max(0, qlen - 0 - self.ext_len)
beg_idx = max(0, end_idx - self.mem_len)
for i in range(len(hids)):
cat = tf.concat([mems[i], hids[i]], axis=0)
tf.stop_gradient(cat)
new_mems.append(cat[beg_idx:end_idx])
return new_mems
def call(self, inputs, mems=None, head_mask=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
mems = inputs[1] if len(inputs) > 1 else mems
head_mask = inputs[2] if len(inputs) > 2 else head_mask
assert len(inputs) <= 3, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
mems = inputs.get('mems', mems)
head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 3, "Too many inputs."
else:
input_ids = inputs
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = tf.transpose(input_ids, perm=(1, 0))
if mems is None:
mems = self.init_mems(input_ids)
qlen, bsz = shape_list(input_ids)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
# and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.n_layer
word_emb = self.word_emb(input_ids)
mlen = shape_list(mems[0])[0] if mems is not None else 0
klen = mlen + qlen
attn_mask = tf.ones([qlen, qlen])
mask_u = tf.linalg.band_part(attn_mask, 0, -1)
mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
attn_mask_pad = tf.zeros([qlen, mlen])
dec_attn_mask = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
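# e.g. for qlen=3 and mlen=2 (same_length=False), dec_attn_mask is
# [[0, 0, 0, 1, 1],
#  [0, 0, 0, 0, 1],
#  [0, 0, 0, 0, 0]]
# where 1 marks key positions that may NOT be attended to (strictly future tokens);
# the first mlen columns (the memory) are always visible.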
if self.same_length:
mask_l = tf.linalg.band_part(attn_mask, -1, 0)
dec_attn_mask = tf.concat([dec_attn_mask[:, :qlen] + mask_l - mask_dia,
dec_attn_mask[:, qlen:]], 1)
# ::: PyTorch masking code for reference :::
# if self.same_length:
# all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
# mask_len = klen - self.mem_len
# if mask_len > 0:
# mask_shift_len = qlen - mask_len
# else:
# mask_shift_len = qlen
# dec_attn_mask = (torch.triu(all_ones, 1+mlen)
# + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1
# else:
# dec_attn_mask = torch.triu(
# word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None]
hids = []
attentions = []
if self.attn_type == 0: # default
pos_seq = tf.range(klen-1, -1, -1.0)
if self.clamp_len > 0:
pos_seq = tf.minimum(pos_seq, self.clamp_len)
pos_emb = self.pos_emb(pos_seq)
core_out = self.drop(word_emb, training=training)
pos_emb = self.drop(pos_emb, training=training)
for i, layer in enumerate(self.layers):
hids.append(core_out)
mems_i = None if mems is None else mems[i]
layer_outputs = layer([core_out, pos_emb, dec_attn_mask,
mems_i, head_mask[i]], training=training)
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
else: # learnable embeddings and absolute embeddings
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
core_out = self.drop(core_out, training=training)
new_mems = self._update_mems(hids, mems, qlen, mlen)
# We transpose back here to shape [bsz, len, hidden_dim]
outputs = [tf.transpose(core_out, perm=(1, 0, 2)), new_mems]
if self.output_hidden_states:
# Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
hids.append(core_out)
hids = list(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
outputs.append(hids)
if self.output_attentions:
# Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
attentions = list(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
outputs.append(attentions)
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = TransfoXLConfig
pretrained_model_archive_map = TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
load_pt_weights = load_transfo_xl_pt_weights_in_tf2
base_model_prefix = "transformer"
TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
It's a causal (uni-directional) transformer with relative positioning (sinusoidal) embeddings which can reuse
previously computed hidden-states to attend to longer context (memory).
This model also uses adaptive softmax inputs and outputs (tied).
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matters related to general usage and behavior.
.. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
https://arxiv.org/abs/1901.02860
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional argument.
This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mems**: (`optional`)
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**mems**:
list of ``tf.Tensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFTransfoXLModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFTransfoXLMainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
(adaptive softmax with weights tied to the adaptive input embeddings)""",
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``None`` if ``lm_labels`` is provided else ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
We don't output them when the loss is computed, to speed up adaptive softmax decoding.
**mems**:
list of ``tf.Tensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import TransfoXLTokenizer, TFTransfoXLLMHeadModel
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
prediction_scores, mems = outputs[:2]
"""
def __init__(self, config):
super(TFTransfoXLLMHeadModel, self).__init__(config)
self.transformer = TFTransfoXLMainLayer(config, name='transformer')
self.sample_softmax = config.sample_softmax
# use sampled softmax
if config.sample_softmax > 0:
raise NotImplementedError
# use adaptive softmax (including standard softmax)
else:
self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model,
config.cutoffs, div_val=config.div_val, name='crit')
def reset_length(self, tgt_len, ext_len, mem_len):
self.transformer.reset_length(tgt_len, ext_len, mem_len)
def init_mems(self, data):
return self.transformer.init_mems(data)
def call(self, inputs, mems=None, head_mask=None, labels=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
mems = inputs[1] if len(inputs) > 1 else mems
head_mask = inputs[2] if len(inputs) > 2 else head_mask
labels = inputs[3] if len(inputs) > 3 else labels
assert len(inputs) <= 4, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
mems = inputs.get('mems', mems)
head_mask = inputs.get('head_mask', head_mask)
labels = inputs.get('labels', labels)
assert len(inputs) <= 4, "Too many inputs."
else:
input_ids = inputs
bsz, tgt_len = shape_list(input_ids)[:2]
transformer_outputs = self.transformer([input_ids, mems, head_mask], training=training)
last_hidden = transformer_outputs[0]
pred_hid = last_hidden[:, -tgt_len:]
outputs = transformer_outputs[1:]
if self.sample_softmax > 0 and training:
raise NotImplementedError
else:
# pred_hid = tf.reshape(pred_hid, (-1, shape_list(pred_hid)[-1]))
softmax_output = self.crit([pred_hid, labels], training=training)
# softmax_output = tf.reshape(softmax_output, (bsz, tgt_len, -1))
outputs = [softmax_output] + outputs
return outputs # logits, new_mems, (all hidden states), (all attentions)
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" A TF 2.0 Adaptive Softmax for Transformer XL model.
"""
from collections import defaultdict
import numpy as np
import tensorflow as tf
from .modeling_tf_utils import shape_list
class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
keep_order=False, **kwargs):
super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
self.n_token = n_token
self.d_embed = d_embed
self.d_proj = d_proj
self.cutoffs = cutoffs + [n_token]
self.cutoff_ends = [0] + self.cutoffs
self.div_val = div_val
self.shortlist_size = self.cutoffs[0]
self.n_clusters = len(self.cutoffs) - 1
self.head_size = self.shortlist_size + self.n_clusters
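# e.g. with the wt103 defaults (n_token=267735, cutoffs=[20000, 40000, 200000]) the head
# softmax covers the 20000 most frequent tokens plus 3 cluster logits (head_size=20003);
# tokens in the 3 tail clusters are scored in a second, smaller softmax per cluster.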
self.keep_order = keep_order
self.out_layers = []
self.out_projs = []
def build(self, input_shape):
if self.n_clusters > 0:
self.cluster_weight = self.add_weight(shape=(self.n_clusters, self.d_embed),
initializer='zeros',
trainable=True,
name='cluster_weight')
self.cluster_bias = self.add_weight(shape=(self.n_clusters,),
initializer='zeros',
trainable=True,
name='cluster_bias')
if self.div_val == 1:
for i in range(len(self.cutoffs)):
if self.d_proj != self.d_embed:
weight = self.add_weight(shape=(self.d_embed, self.d_proj),
initializer='zeros',
trainable=True,
name='out_projs_._{}'.format(i))
self.out_projs.append(weight)
else:
self.out_projs.append(None)
weight = self.add_weight(shape=(self.n_token, self.d_embed,),
initializer='zeros',
trainable=True,
name='out_layers_._{}_._weight'.format(i))
bias = self.add_weight(shape=(self.n_token,),
initializer='zeros',
trainable=True,
name='out_layers_._{}_._bias'.format(i))
self.out_layers.append((weight, bias))
else:
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
d_emb_i = self.d_embed // (self.div_val ** i)
weight = self.add_weight(shape=(d_emb_i, self.d_proj),
initializer='zeros',
trainable=True,
name='out_projs_._{}'.format(i))
self.out_projs.append(weight)
weight = self.add_weight(shape=(r_idx-l_idx, d_emb_i,),
initializer='zeros',
trainable=True,
name='out_layers_._{}_._weight'.format(i))
bias = self.add_weight(shape=(r_idx-l_idx,),
initializer='zeros',
trainable=True,
name='out_layers_._{}_._bias'.format(i))
self.out_layers.append((weight, bias))
super(TFAdaptiveSoftmaxMask, self).build(input_shape)
@staticmethod
def _logit(x, W, b, proj=None):
y = x
if proj is not None:
y = tf.einsum('ibd,ed->ibe', y, proj)
return tf.einsum('ibd,nd->ibn', y, W) + b
@staticmethod
def _gather_logprob(logprob, target):
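# For logprob of shape [N, V] and target of shape [N], returns logprob[i, target[i]] for every row i.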
lp_size = tf.shape(logprob)
r = tf.range(lp_size[0])
idx = tf.stack([r, target], 1)
return tf.gather_nd(logprob, idx)
def call(self, inputs, return_mean=True, training=False):
hidden, target = inputs
head_logprob = 0
if self.n_clusters == 0:
output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
if target is not None:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
out = tf.nn.log_softmax(output, axis=-1)
else:
hidden_sizes = shape_list(hidden)
out = []
loss = tf.zeros(hidden_sizes[:2], dtype=tf.float32)
for i in range(len(self.cutoffs)):
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
if target is not None:
mask = (target >= l_idx) & (target < r_idx)
mask_idx = tf.where(mask)
cur_target = tf.boolean_mask(target, mask) - l_idx
if self.div_val == 1:
cur_W = self.out_layers[0][0][l_idx:r_idx]
cur_b = self.out_layers[0][1][l_idx:r_idx]
else:
cur_W = self.out_layers[i][0]
cur_b = self.out_layers[i][1]
if i == 0:
cur_W = tf.concat([cur_W, self.cluster_weight], 0)
cur_b = tf.concat([cur_b, self.cluster_bias], 0)
head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0])
head_logprob = tf.nn.log_softmax(head_logit)
out.append(head_logprob[..., :self.cutoffs[0]])
if target is not None:
cur_head_logprob = tf.boolean_mask(head_logprob, mask)
cur_logprob = self._gather_logprob(cur_head_logprob, cur_target)
else:
tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i])
tail_logprob = tf.nn.log_softmax(tail_logit)
cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster
logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob
out.append(logprob_i)
if target is not None:
cur_head_logprob = tf.boolean_mask(head_logprob, mask)
cur_tail_logprob = tf.boolean_mask(tail_logprob, mask)
cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
if target is not None:
loss += tf.scatter_nd(mask_idx, -cur_logprob, tf.cast(tf.shape(loss), dtype=tf.int64))
out = tf.concat(out, axis=-1)
if target is not None:
if return_mean:
loss = tf.reduce_mean(loss)
# Add the training-time loss value to the layer using `self.add_loss()`.
self.add_loss(loss)
# Log the loss as a metric (we could log arbitrary metrics,
# including different metrics for training and inference).
self.add_metric(loss, name=self.name, aggregation='mean' if return_mean else '')
return out
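# Illustrative sketch (not part of the original file): the cluster loop above
# gathers the log-probability of each target token with `tf.gather_nd` and then
# scatters the per-token negative log-likelihoods back into the full loss tensor
# with `tf.scatter_nd`. The toy example below reproduces that pattern on a
# hand-made log-probability matrix; all names and values here are hypothetical.
def _example_gather_scatter_logprob():
    import tensorflow as tf
    logprob = tf.math.log(tf.constant([[0.7, 0.2, 0.1],
                                       [0.1, 0.8, 0.1]]))        # [2 tokens, 3 classes]
    target = tf.constant([0, 1])                                 # gold class per token
    idx = tf.stack([tf.range(2), target], axis=1)                # [[0, 0], [1, 1]]
    token_logprob = tf.gather_nd(logprob, idx)                   # log p(target) for each token
    # scatter the two losses into a flat buffer of 4 positions (e.g. the masked tokens)
    loss = tf.scatter_nd(tf.constant([[1], [3]]), -token_logprob, shape=(4,))
    return loss                                                  # zeros except at positions 1 and 3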
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TF general model utils."""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import logging
import os
import tensorflow as tf
from .configuration_utils import PretrainedConfig
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
logger = logging.getLogger(__name__)
class TFPreTrainedModel(tf.keras.Model):
r""" Base class for all TF models.
:class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention layers.
Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- ``pretrained_model_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
- ``load_pt_weights``: a python ``method`` for loading a PyTorch checkpoint in a TF 2.0 model, taking as arguments:
- ``model``: an instance of the relevant subclass of :class:`~transformers.TFPreTrainedModel`,
- ``path``: a path (string) to the PyTorch checkpoint.
- ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
"""
config_class = None
pretrained_model_archive_map = {}
load_pt_weights = lambda model, path: None
base_model_prefix = ""
def __init__(self, config, *inputs, **kwargs):
super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
if not isinstance(config, PretrainedConfig):
raise ValueError(
"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
"To create a model from a pretrained model use "
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
self.__class__.__name__, self.__class__.__name__
))
# Save config in model
self.config = config
def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
""" Build a resized Embedding Variable from a provided token Embedding Module.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
Args:
new_num_tokens: (`optional`) int
New number of tokens in the embedding matrix.
Increasing the size will add newly initialized vectors at the end
Reducing the size will remove vectors from the end
If not provided or None: return the provided token Embedding Module.
Return: ``tf.Variable``
Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
"""
# if new_num_tokens is None:
# return old_embeddings
# old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
# if old_num_tokens == new_num_tokens:
# return old_embeddings
# # Build new embeddings
# new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
# new_embeddings.to(old_embeddings.weight.device)
# # initialize all new embeddings (in particular added tokens)
# self._init_weights(new_embeddings)
# # Copy word embeddings from the previous weights
# num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
# new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
# return new_embeddings
def resize_token_embeddings(self, new_num_tokens=None):
""" Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
Arguments:
new_num_tokens: (`optional`) int:
New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model.
Return: ``tf.Variable``
Pointer to the input tokens Embeddings Module of the model
"""
raise NotImplementedError
def prune_heads(self, heads_to_prune):
""" Prunes heads of the base model.
Arguments:
heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
"""
raise NotImplementedError
def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it
can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
# Save configuration file
self.config.save_pretrained(save_directory)
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME)
self.save_weights(output_model_file)
logger.info("Model weights saved in {}".format(output_model_file))
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.
Dropout modules are deactivated by default; to run the model in training mode (with dropout active), pass ``training=True`` when calling the model.
The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
It is up to you to train those weights with a downstream fine-tuning task.
The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.
Parameters:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
from_pt: (`optional`) boolean, default False:
Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument).
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
configuration should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
kwargs: (`optional`) Remaining dictionary of keyword arguments:
Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attentions=True``). They behave differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
model = TFBertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
model = TFBertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
model = TFBertModel.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
assert model.config.output_attentions == True
# Loading from a PyTorch checkpoint file instead of a TF 2.0 model (slower)
config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
model = TFBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config)
"""
config = kwargs.pop('config', None)
cache_dir = kwargs.pop('cache_dir', None)
from_pt = kwargs.pop('from_pt', False)
force_download = kwargs.pop('force_download', False)
proxies = kwargs.pop('proxies', None)
# Load config
if config is None:
config, model_kwargs = cls.config_class.from_pretrained(
pretrained_model_name_or_path, *model_args,
cache_dir=cache_dir, return_unused_kwargs=True,
force_download=force_download,
**kwargs
)
else:
model_kwargs = kwargs
# Load model
if pretrained_model_name_or_path is not None:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
# Load from a TF 2.0 checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
# Load from a PyTorch checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
else:
raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
[WEIGHTS_NAME, TF2_WEIGHTS_NAME],
pretrained_model_name_or_path))
elif os.path.isfile(pretrained_model_name_or_path):
archive_file = pretrained_model_name_or_path
else:
raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path))
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(cls.pretrained_model_archive_map.keys()),
archive_file))
raise e
if resolved_archive_file == archive_file:
logger.info("loading weights file {}".format(archive_file))
else:
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
else:
resolved_archive_file = None
# Instantiate model.
model = cls(config, *model_args, **model_kwargs)
if from_pt:
# Load from a PyTorch checkpoint
return cls.load_pt_weights(model, resolved_archive_file)
inputs = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
ret = model(inputs, training=False) # build the network with dummy inputs
assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file)
# 'by_name' allows us to do transfer learning by skipping/adding layers
# see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357
model.load_weights(resolved_archive_file, by_name=True)
ret = model(inputs, training=False) # Make sure restore ops are run
return model
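# Illustrative sketch (not part of the original file): a typical round trip with
# `save_pretrained` and `from_pretrained` defined above. `TFBertModel` stands in
# for any concrete subclass of TFPreTrainedModel; the directory path is hypothetical.
def _example_save_and_reload(save_directory='./my_model_directory/'):
    import os
    from transformers import TFBertModel
    model = TFBertModel.from_pretrained('bert-base-uncased')   # download weights + config and cache them
    os.makedirs(save_directory, exist_ok=True)                 # save_pretrained expects an existing directory
    model.save_pretrained(save_directory)                      # writes config.json and the TF 2.0 weights file
    reloaded = TFBertModel.from_pretrained(save_directory)     # reload from the local directory
    return reloaded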
class TFConv1D(tf.keras.layers.Layer):
def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
""" TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
Basically works like a Linear layer but the weights are transposed
"""
super(TFConv1D, self).__init__(**kwargs)
self.nf = nf
self.nx = nx
self.initializer_range = initializer_range
def build(self, input_shape):
self.weight = self.add_weight(
"weight",
shape=[self.nx, self.nf],
initializer=get_initializer(self.initializer_range))
self.bias = self.add_weight(
"bias",
shape=[1, self.nf],
initializer=tf.zeros_initializer())
def call(self, x):
bz, sl = shape_list(x)[:2]
x = tf.reshape(x, [-1, self.nx])
x = tf.matmul(x, self.weight) + self.bias
x = tf.reshape(x, [bz, sl, self.nf])
return x
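# Illustrative sketch (not part of the original file): TFConv1D behaves like a
# Dense layer whose kernel is stored as [nx, nf] and applied to the last axis.
# The shape check below uses hypothetical sizes.
def _example_tf_conv1d():
    import tensorflow as tf
    layer = TFConv1D(nf=8, nx=4)                  # project the last dimension 4 -> 8
    x = tf.random.uniform((2, 5, 4))              # [batch, seq_len, nx]
    y = layer(x)                                  # [batch, seq_len, nf]
    assert y.shape == (2, 5, 8)
    return y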
class TFSharedEmbeddings(tf.keras.layers.Layer):
"""Construct shared token embeddings.
"""
def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs):
super(TFSharedEmbeddings, self).__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range
def build(self, input_shape):
"""Build shared word embedding layer
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
self.weight = self.add_weight(
"weight",
shape=[self.vocab_size, self.hidden_size],
initializer=get_initializer(self.initializer_range))
super(TFSharedEmbeddings, self).build(input_shape)
def call(self, inputs, mode="embedding"):
"""Get token embeddings of inputs.
Args:
inputs: An int64 tensor of shape [batch_size, length] containing token ids in "embedding" mode, or a float tensor of shape [..., hidden_size] in "linear" mode.
mode: string, either "embedding" or "linear".
Returns:
outputs: (1) If mode == "embedding", output embedding tensor, float32 with
shape [batch_size, length, embedding_size]; (2) mode == "linear", output
linear tensor, float32 with shape [batch_size, length, vocab_size].
Raises:
ValueError: if mode is not valid.
Shared weights logic adapted from
https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
"""
if mode == "embedding":
return self._embedding(inputs)
elif mode == "linear":
return self._linear(inputs)
else:
raise ValueError("mode {} is not valid.".format(mode))
def _embedding(self, input_ids):
"""Applies embedding based on inputs tensor."""
return tf.gather(self.weight, input_ids)
def _linear(self, inputs):
"""Computes logits by running inputs through a linear layer.
Args:
inputs: A float32 tensor with shape [..., hidden_size]
Returns:
float32 tensor with shape [..., vocab_size].
"""
first_dims = shape_list(inputs)[:-1]
x = tf.reshape(inputs, [-1, self.hidden_size])
logits = tf.matmul(x, self.weight, transpose_b=True)
return tf.reshape(logits, first_dims + [self.vocab_size])
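# Illustrative sketch (not part of the original file): both modes of
# TFSharedEmbeddings reuse the same [vocab_size, hidden_size] matrix, which is
# how input embeddings and output projections are tied. Sizes are hypothetical.
def _example_shared_embeddings():
    import tensorflow as tf
    emb = TFSharedEmbeddings(vocab_size=10, hidden_size=6)
    ids = tf.constant([[1, 2, 3]])                       # [batch, length]
    hidden = emb(ids, mode="embedding")                  # [1, 3, 6]
    logits = emb(hidden, mode="linear")                  # [1, 3, 10], reuses the same weight matrix
    assert logits.shape == (1, 3, 10)
    return logits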
class TFSequenceSummary(tf.keras.layers.Layer):
r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
Args of the config class:
summary_type:
- 'last' => [default] take the last token hidden state (like XLNet)
- 'first' => take the first token hidden state (like Bert)
- 'mean' => take the mean of all tokens hidden states
- 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
- 'attn' => Not implemented now, use multi-head attention
summary_use_proj: Add a projection after the vector extraction
summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default: no activation.
summary_first_dropout: Add a dropout before the projection and activation
summary_last_dropout: Add a dropout after the projection and activation
"""
def __init__(self, config, initializer_range=0.02, **kwargs):
super(TFSequenceSummary, self).__init__(**kwargs)
self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
if self.summary_type == 'attn':
# We should use a standard multi-head attention module with absolute positional embedding for that.
# Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
# We can probably just use the multi-head attention module of PyTorch >=1.1.0
raise NotImplementedError
self.summary = None
if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
num_classes = config.num_labels
else:
num_classes = config.hidden_size
self.summary = tf.keras.layers.Dense(num_classes,
kernel_initializer=get_initializer(initializer_range),
name='summary')
self.activation = None
if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
self.activation = tf.keras.activations.tanh
self.first_dropout = None
if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout)
self.last_dropout = None
if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout)
def call(self, inputs, training=False):
""" hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
cls_index: [optional] position of the classification token if summary_type == 'cls_index',
shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
if summary_type == 'cls_index' and cls_index is None:
we take the last token of the sequence as classification token
"""
if not isinstance(inputs, (dict, tuple, list)):
hidden_states = inputs
cls_index = None
elif isinstance(inputs, (tuple, list)):
hidden_states = inputs[0]
cls_index = inputs[1] if len(inputs) > 1 else None
assert len(inputs) <= 2, "Too many inputs."
else:
hidden_states = inputs.get('hidden_states')
cls_index = inputs.get('cls_index', None)
if self.summary_type == 'last':
output = hidden_states[:, -1]
elif self.summary_type == 'first':
output = hidden_states[:, 0]
elif self.summary_type == 'mean':
output = tf.reduce_mean(hidden_states, axis=1)
elif self.summary_type == 'cls_index':
hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims]
if cls_index is None:
cls_index = tf.fill(hidden_shape[:-2], hidden_shape[-2] - 1) # A tensor of shape [batch] or [batch, num choices] filled with the index of the last token (sequence length - 1)
cls_shape = shape_list(cls_index)
if len(cls_shape) <= len(hidden_shape) - 2:
cls_index = cls_index[..., tf.newaxis]
# else:
# cls_index = cls_index[..., tf.newaxis]
# cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
# shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2)
output = tf.squeeze(output, axis=len(hidden_shape) - 2) # shape of output: (batch, num choices, hidden_size)
elif self.summary_type == 'attn':
raise NotImplementedError
if training and self.first_dropout is not None:
output = self.first_dropout(output)
if self.summary is not None:
output = self.summary(output)
if self.activation is not None:
output = self.activation(output)
if training and self.last_dropout is not None:
output = self.last_dropout(output)
return output
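# Illustrative sketch (not part of the original file): a minimal config object is
# enough to drive TFSequenceSummary; here we only pool the hidden state of the
# last token ('last'). The SimpleNamespace config and all sizes are hypothetical.
def _example_sequence_summary():
    import tensorflow as tf
    from types import SimpleNamespace
    config = SimpleNamespace(summary_type='last', summary_use_proj=False)
    summary = TFSequenceSummary(config)
    hidden_states = tf.random.uniform((2, 7, 16))        # [batch, seq_len, hidden]
    pooled = summary(hidden_states)                      # hidden state of the last token
    assert pooled.shape == (2, 16)
    return pooled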
def shape_list(x):
"""Deal with dynamic shape in tensorflow cleanly."""
static = x.shape.as_list()
dynamic = tf.shape(x)
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
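# Illustrative sketch (not part of the original file): shape_list keeps static
# dimensions as plain Python ints and only falls back to dynamic tensors for
# unknown dimensions, which keeps later reshapes cheap inside tf.function.
def _example_shape_list():
    import tensorflow as tf
    x = tf.zeros((3, 5))
    assert shape_list(x) == [3, 5]                       # fully static shape: plain ints
    @tf.function(input_signature=[tf.TensorSpec([None, 5])])
    def flatten(t):
        dims = shape_list(t)                             # [<dynamic batch>, 5]
        return tf.reshape(t, [dims[0] * dims[1]])        # flatten without knowing the batch size
    return flatten(tf.zeros((2, 5)))                     # tensor of shape (10,)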
def get_initializer(initializer_range=0.02):
"""Creates a `tf.initializers.truncated_normal` with the given range.
Args:
initializer_range: float, initializer range for stddev.
Returns:
TruncatedNormal initializer with stddev = `initializer_range`.
"""
return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
# coding=utf-8
# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 XLM model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import logging
import math
import os
import itertools
import numpy as np
import tensorflow as tf
from .configuration_xlm import XLMConfig
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
logger = logging.getLogger(__name__)
TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5",
'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5",
'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5",
'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5",
'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5",
'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5",
'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5",
'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5",
'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5",
'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5",
}
def load_xlm_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
# build the network
inputs_list = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
attns_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
if tf_model.config.use_lang_emb and tf_model.config.n_langs > 1:
langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
else:
langs_list = None
tf_inputs = [inputs_list, attns_list, langs_list]
tfo = tf_model(tf_inputs, training=False)
return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
def create_sinusoidal_embeddings(n_pos, dim, out):
position_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
for pos in range(n_pos)
])
out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2]))
out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2]))
def gelu(x):
""" Gaussian Error Linear Unit.
Original Implementation of the gelu activation function in Google Bert repo when initially created.
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
Also see https://arxiv.org/abs/1606.08415
"""
cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0)))
return x * cdf
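# Illustrative sketch (not part of the original file): the erf-based gelu above is
# the exact form, whereas GPT-2/XLNet elsewhere in this repository use the tanh
# approximation. The two differ only by a small numerical error, as checked below.
def _example_compare_gelu():
    import numpy as np
    import tensorflow as tf
    x = tf.constant([-2.0, -0.5, 0.0, 0.5, 2.0])
    exact = gelu(x)
    approx = 0.5 * x * (1.0 + tf.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
    max_abs_diff = tf.reduce_max(tf.abs(exact - approx))   # on the order of 1e-3 or smaller
    return max_abs_diff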
def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32):
"""
Generate hidden states mask, and optionally an attention mask.
"""
bs = shape_list(lengths)[0]
if padding_mask is not None:
mask = padding_mask
else:
# assert lengths.max().item() <= slen
alen = tf.range(slen)
mask = tf.math.less(alen, lengths[:, tf.newaxis])
# attention mask is the same as mask, or triangular inferior attention (causal)
if causal:
attn_mask = tf.less_equal(tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)),
alen[tf.newaxis, :, tf.newaxis])
else:
attn_mask = mask
# sanity check
assert shape_list(mask) == [bs, slen]
assert causal is False or shape_list(attn_mask) == [bs, slen, slen]
mask = tf.cast(mask, dtype=dtype)
attn_mask = tf.cast(attn_mask, dtype=dtype)
return mask, attn_mask
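# Illustrative sketch (not part of the original file): building masks for a toy
# batch of two sequences of lengths 3 and 5, with and without causality.
def _example_get_masks():
    import tensorflow as tf
    lengths = tf.constant([3, 5])
    mask, attn_mask = get_masks(slen=5, lengths=lengths, causal=False)
    # mask[0] == [1, 1, 1, 0, 0]: positions past the sequence length are padding
    causal_mask, causal_attn = get_masks(slen=5, lengths=lengths, causal=True)
    # causal_attn has shape [bs, slen, slen]: token i may only attend to tokens j <= i
    return mask, causal_attn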
class TFMultiHeadAttention(tf.keras.layers.Layer):
NEW_ID = itertools.count()
def __init__(self, n_heads, dim, config, **kwargs):
super(TFMultiHeadAttention, self).__init__(**kwargs)
self.layer_id = next(TFMultiHeadAttention.NEW_ID)
self.output_attentions = config.output_attentions
self.dim = dim
self.n_heads = n_heads
assert self.dim % self.n_heads == 0
self.q_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='q_lin')
self.k_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='k_lin')
self.v_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='v_lin')
self.out_lin = tf.keras.layers.Dense(dim, kernel_initializer=get_initializer(config.init_std), name='out_lin')
self.dropout = tf.keras.layers.Dropout(config.attention_dropout)
self.pruned_heads = set()
def prune_heads(self, heads):
raise NotImplementedError
def call(self, inputs, training=False):
"""
Self-attention (if kv is None) or attention over source sentence (provided by kv).
"""
input, mask, kv, cache, head_mask = inputs
# Input is (bs, qlen, dim)
# Mask is (bs, klen) (non-causal) or (bs, klen, klen)
bs, qlen, dim = shape_list(input)
if kv is None:
klen = qlen if cache is None else cache['slen'] + qlen
else:
klen = shape_list(kv)[1]
# assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
n_heads = self.n_heads
dim_per_head = self.dim // n_heads
mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen)
def shape(x):
""" projection """
return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
def unshape(x):
""" compute context """
return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head)
if kv is None:
k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head)
v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head)
elif cache is None or self.layer_id not in cache:
k = v = kv
k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head)
v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head)
if cache is not None:
if self.layer_id in cache:
if kv is None:
k_, v_ = cache[self.layer_id]
k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head)
v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head)
else:
k, v = cache[self.layer_id]
cache[self.layer_id] = (k, v)
q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head)
scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen)
mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen)
# scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen)
scores = scores - 1e30 * (1.0 - mask)
weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen)
weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen)
# Mask heads if we want to
if head_mask is not None:
weights = weights * head_mask
context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head)
context = unshape(context) # (bs, qlen, dim)
outputs = (self.out_lin(context),)
if self.output_attentions:
outputs = outputs + (weights,)
return outputs
class TFTransformerFFN(tf.keras.layers.Layer):
def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs):
super(TFTransformerFFN, self).__init__(**kwargs)
self.lin1 = tf.keras.layers.Dense(dim_hidden, kernel_initializer=get_initializer(config.init_std), name='lin1')
self.lin2 = tf.keras.layers.Dense(out_dim, kernel_initializer=get_initializer(config.init_std), name='lin2')
self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu
self.dropout = tf.keras.layers.Dropout(config.dropout)
def call(self, input, training=False):
x = self.lin1(input)
x = self.act(x)
x = self.lin2(x)
x = self.dropout(x, training=training)
return x
class TFXLMMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLMMainLayer, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
# encoder / decoder, output layer
self.is_encoder = config.is_encoder
self.is_decoder = not config.is_encoder
if self.is_decoder:
raise NotImplementedError("Currently XLM can only be used as an encoder")
# self.with_output = with_output
self.causal = config.causal
# dictionary / languages
self.n_langs = config.n_langs
self.use_lang_emb = config.use_lang_emb
self.n_words = config.n_words
self.eos_index = config.eos_index
self.pad_index = config.pad_index
# self.dico = dico
# self.id2lang = config.id2lang
# self.lang2id = config.lang2id
# assert len(self.dico) == self.n_words
# assert len(self.id2lang) == len(self.lang2id) == self.n_langs
# model parameters
self.dim = config.emb_dim # 512 by default
self.hidden_dim = self.dim * 4 # 2048 by default
self.n_heads = config.n_heads # 8 by default
self.n_layers = config.n_layers
assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
# embeddings
self.dropout = tf.keras.layers.Dropout(config.dropout)
self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout)
self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings,
self.dim,
embeddings_initializer=get_initializer(config.embed_init_std),
name='position_embeddings')
if config.sinusoidal_embeddings:
raise NotImplementedError
# create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
if config.n_langs > 1 and config.use_lang_emb:
self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs,
self.dim,
embeddings_initializer=get_initializer(config.embed_init_std),
name='lang_embeddings')
self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, initializer_range=config.embed_init_std, name='embeddings') # padding_idx=self.pad_index)
self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb')
# transformer layers
self.attentions = []
self.layer_norm1 = []
self.ffns = []
self.layer_norm2 = []
# if self.is_decoder:
# self.layer_norm15 = []
# self.encoder_attn = []
for i in range(self.n_layers):
self.attentions.append(TFMultiHeadAttention(self.n_heads, self.dim, config=config, name='attentions_._{}'.format(i)))
self.layer_norm1.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm1_._{}'.format(i)))
# if self.is_decoder:
# self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
# self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
self.ffns.append(TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name='ffns_._{}'.format(i)))
self.layer_norm2.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm2_._{}'.format(i)))
if hasattr(config, "pruned_heads"):
pruned_heads = config.pruned_heads.copy().items()
config.pruned_heads = {}
for layer, heads in pruned_heads:
if self.attentions[int(layer)].n_heads == config.n_heads:
self.prune_heads({int(layer): list(map(int, heads))})
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
""" Prunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
See base class PreTrainedModel
"""
raise NotImplementedError
def call(self, inputs, attention_mask=None, langs=None, token_type_ids=None,
position_ids=None, lengths=None, cache=None, head_mask=None,
training=False): # removed: src_enc=None, src_len=None
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
langs = inputs[2] if len(inputs) > 2 else langs
token_type_ids = inputs[3] if len(inputs) > 3 else token_type_ids
position_ids = inputs[4] if len(inputs) > 4 else position_ids
lengths = inputs[5] if len(inputs) > 5 else lengths
cache = inputs[6] if len(inputs) > 6 else cache
head_mask = inputs[7] if len(inputs) > 7 else head_mask
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
langs = inputs.get('langs', langs)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
position_ids = inputs.get('position_ids', position_ids)
lengths = inputs.get('lengths', lengths)
cache = inputs.get('cache', cache)
head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 8, "Too many inputs."
else:
input_ids = inputs
if lengths is None:
lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1)
# mask = input_ids != self.pad_index
# check inputs
bs, slen = shape_list(input_ids)
assert shape_list(lengths)[0] == bs
# assert lengths.max().item() <= slen
# input_ids = input_ids.transpose(0, 1) # batch size as dimension 0
# assert (src_enc is None) == (src_len is None)
# if src_enc is not None:
# assert self.is_decoder
# assert src_enc.size(0) == bs
# generate masks
mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
# if self.is_decoder and src_enc is not None:
# src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
# position_ids
if position_ids is None:
position_ids = tf.expand_dims(tf.range(slen), axis=0)
else:
assert shape_list(position_ids) == [bs, slen] # (slen, bs)
# position_ids = position_ids.transpose(0, 1)
# langs
if langs is not None:
assert shape_list(langs) == [bs, slen] # (slen, bs)
# langs = langs.transpose(0, 1)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
if head_mask is not None:
raise NotImplementedError
else:
head_mask = [None] * self.n_layers
# do not recompute cached elements
if cache is not None:
_slen = slen - cache['slen']
input_ids = input_ids[:, -_slen:]
position_ids = position_ids[:, -_slen:]
if langs is not None:
langs = langs[:, -_slen:]
mask = mask[:, -_slen:]
attn_mask = attn_mask[:, -_slen:]
# embeddings
tensor = self.embeddings(input_ids)
tensor = tensor + self.position_embeddings(position_ids)
if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs)
if token_type_ids is not None:
tensor = tensor + self.embeddings(token_type_ids)
tensor = self.layer_norm_emb(tensor)
tensor = self.dropout(tensor, training=training)
tensor = tensor * mask[..., tf.newaxis]
# transformer layers
hidden_states = ()
attentions = ()
for i in range(self.n_layers):
if self.output_hidden_states:
hidden_states = hidden_states + (tensor,)
# self attention
attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training)
attn = attn_outputs[0]
if self.output_attentions:
attentions = attentions + (attn_outputs[1],)
attn = self.dropout(attn, training=training)
tensor = tensor + attn
tensor = self.layer_norm1[i](tensor)
# encoder attention (for decoder only)
# if self.is_decoder and src_enc is not None:
# attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
# attn = F.dropout(attn, p=self.dropout, training=self.training)
# tensor = tensor + attn
# tensor = self.layer_norm15[i](tensor)
# FFN
tensor = tensor + self.ffns[i](tensor)
tensor = self.layer_norm2[i](tensor)
tensor = tensor * mask[..., tf.newaxis]
# Add last hidden state
if self.output_hidden_states:
hidden_states = hidden_states + (tensor,)
# update cache length
if cache is not None:
cache['slen'] += shape_list(tensor)[1]
# move back sequence length to dimension 0
# tensor = tensor.transpose(0, 1)
outputs = (tensor,)
if self.output_hidden_states:
outputs = outputs + (hidden_states,)
if self.output_attentions:
outputs = outputs + (attentions,)
return outputs # outputs, (hidden_states), (attentions)
class TFXLMPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
"""
config_class = XLMConfig
pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
load_pt_weights = load_xlm_pt_weights_in_tf2
base_model_prefix = "transformer"
XLM_START_DOCSTRING = r""" The XLM model was proposed in
`Cross-lingual Language Model Pretraining`_
by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
- a causal language modeling (CLM) objective (next token prediction),
- a masked language modeling (MLM) objective (Bert-like), or
- a Translation Language Modeling (TLM) objective (an extension of BERT's MLM to multiple language inputs)
Original code can be found `here`_.
This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
refer to the TF 2.0 documentation for all matter related to general usage and behavior.
.. _`Cross-lingual Language Model Pretraining`:
https://arxiv.org/abs/1901.07291
.. _`here`:
https://github.com/facebookresearch/XLM
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
TF 2.0 models accept two formats as inputs:
- having all inputs as keyword arguments (like PyTorch models), or
- having all inputs as a list, tuple or dict in the first positional arguments.
This second option is useful when using the `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument (all three are shown in the illustrative sketch after this docstring):
- a single Tensor with input_ids only and nothing else: `model(input_ids)`
- a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
`model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
- a dictionary with one or several input Tensors associated to the input names given in the docstring:
`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
Parameters:
config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
XLM_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`transformers.XLMTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**langs**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens to be used to indicate the language of each token in the input.
Indices are languages ids which can be obtained from the language names by using two conversion mappings
provided in the configuration of the model (only provided for multilingual models).
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
**position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, config.max_position_embeddings - 1]``.
**lengths**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size,)``:
Length of each sentence that can be used to avoid performing attention on padding token indices.
You can also use `attention_mask` for the same result (see above), kept here for compatibility.
Indices selected in ``[0, ..., input_ids.size(-1)]``:
**cache**:
dictionary with ``Numpy array`` or ``tf.Tensor`` that contains pre-computed
hidden-states (key and values in the attention blocks) as computed by the model
(see `cache` output below). Can be used to speed up sequential decoding.
The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class TFXLMModel(TFXLMPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLMModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
class TFXLMPredLayer(tf.keras.layers.Layer):
"""
Prediction layer (cross_entropy or adaptive_softmax).
"""
def __init__(self, config, input_embeddings, **kwargs):
super(TFXLMPredLayer, self).__init__(**kwargs)
self.asm = config.asm
self.n_words = config.n_words
self.pad_index = config.pad_index
if config.asm is False:
self.input_embeddings = input_embeddings
else:
raise NotImplementedError
# self.proj = nn.AdaptiveLogSoftmaxWithLoss(
# in_features=dim,
# n_classes=config.n_words,
# cutoffs=config.asm_cutoffs,
# div_value=config.asm_div_value,
# head_bias=True, # default is False
# )
def build(self, input_shape):
# The output weights are the same as the input embeddings, but there is an output-only bias for each token.
self.bias = self.add_weight(shape=(self.n_words,),
initializer='zeros',
trainable=True,
name='bias')
super(TFXLMPredLayer, self).build(input_shape)
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
hidden_states = hidden_states + self.bias
return hidden_states
@add_start_docstrings("""The XLM Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class TFXLMWithLMHeadModel(TFXLMPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMWithLMHeadModel
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name='transformer')
self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer_._proj')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
output = transformer_outputs[0]
outputs = self.pred_layer(output)
outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs
@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class TFXLMForSequenceClassification(TFXLMPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForSequenceClassification
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
labels = tf.constant([1])[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLMMainLayer(config, name='transformer')
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name='sequence_summary')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
output = transformer_outputs[0]
logits = self.sequence_summary(output)
outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs
@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
start_scores, end_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLMMainLayer(config, name='transformer')
self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.init_std),
name='qa_outputs')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states and attentions if they are in the transformer outputs
return outputs # start_logits, end_logits, (hidden_states), (attentions)
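# Illustrative sketch (not part of the original file): turning the start/end
# logits produced above into a predicted answer span. `start_logits`,
# `end_logits`, `input_ids` and `tokenizer` are hypothetical eager inputs
# (the tokenizer is assumed to be the matching XLMTokenizer instance).
def _example_extract_answer_span(start_logits, end_logits, input_ids, tokenizer):
    import tensorflow as tf
    start = int(tf.argmax(start_logits[0]))                       # most likely start position
    end = int(tf.argmax(end_logits[0]))                           # most likely end position
    answer_ids = input_ids[0, start:end + 1].numpy().tolist()     # inclusive token span
    return tokenizer.decode(answer_ids)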
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TF 2.0 XLNet model.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import logging
import math
import os
import sys
from io import open
import numpy as np
import tensorflow as tf
from .configuration_xlnet import XLNetConfig
from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list, get_initializer
from .file_utils import add_start_docstrings
from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
logger = logging.getLogger(__name__)
TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-tf_model.h5",
'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-tf_model.h5",
}
def load_xlnet_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
tf_inputs = tf.constant(inputs_list)
tfo = tf_model(tf_inputs, training=False) # build the network
return load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_inputs=tf_inputs)
def gelu(x):
""" Implementation of the gelu activation function.
XLNet is using OpenAI GPT's gelu
Also see https://arxiv.org/abs/1606.08415
"""
cdf = 0.5 * (1.0 + tf.tanh(
(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
return x * cdf
def swish(x):
return x * tf.sigmoid(x)
ACT2FN = {"gelu": tf.keras.layers.Activation(gelu),
"relu": tf.keras.activations.relu,
"swish": tf.keras.layers.Activation(swish)}
class TFXLNetRelativeAttention(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLNetRelativeAttention, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
if config.d_model % config.n_head != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.d_model, config.n_head))
self.n_head = config.n_head
self.d_head = config.d_head
self.d_model = config.d_model
self.scale = 1 / (config.d_head ** 0.5)
self.initializer_range = config.initializer_range
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
self.dropout = tf.keras.layers.Dropout(config.dropout)
def build(self, input_shape):
initializer = get_initializer(self.initializer_range)
self.q = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='q')
self.k = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='k')
self.v = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='v')
self.o = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='o')
self.r = self.add_weight(shape=(self.d_model, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='r')
self.r_r_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True, name='r_r_bias')
self.r_s_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True, name='r_s_bias')
self.r_w_bias = self.add_weight(shape=(self.n_head, self.d_head),
initializer='zeros',
trainable=True, name='r_w_bias')
self.seg_embed = self.add_weight(shape=(2, self.n_head, self.d_head),
initializer=initializer,
trainable=True, name='seg_embed')
super(TFXLNetRelativeAttention, self).build(input_shape)
def prune_heads(self, heads):
raise NotImplementedError
@staticmethod
def rel_shift(x, klen=-1):
"""perform relative shift to form the relative attention score."""
x_size = shape_list(x)
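        # Relative-shift trick: the incoming scores are indexed by (query position, relative distance).
        # Re-viewing the buffer with the two leading axes swapped (a reshape, deliberately not a
        # transpose), dropping one leading slice and reshaping back re-aligns the entries so that they
        # are indexed by (query position, key position), without any gather ops.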
x = tf.reshape(x, (x_size[1], x_size[0], x_size[2], x_size[3]))
x = x[1:, ...]
x = tf.reshape(x, (x_size[0], x_size[1] - 1, x_size[2], x_size[3]))
x = x[:, 0:klen, :, :]
# x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long))
return x
def rel_attn_core(self, inputs, training=False):
"""Core relative positional attention operations."""
q_head, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask, head_mask = inputs
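        # Einsum letters used in this layer: i/j = query/key positions, b = batch, n = heads,
        # d = per-head dimension, h = model dimension, s = segment buckets, and m/l = the two
        # position axes involved when a target_mapping is applied to the query stream.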
# content based attention score
ac = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_w_bias, k_head_h)
# position based attention score
bd = tf.einsum('ibnd,jbnd->ijbn', q_head + self.r_r_bias, k_head_r)
bd = self.rel_shift(bd, klen=ac.shape[1])
# segment based attention score
if seg_mat is None:
ef = 0
else:
ef = tf.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed)
ef = tf.einsum('ijbs,ibns->ijbn', seg_mat, ef)
# merge attention scores and perform masking
attn_score = (ac + bd + ef) * self.scale
if attn_mask is not None:
# attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
if attn_mask.dtype == tf.float16:
attn_score = attn_score - 65500 * attn_mask
else:
attn_score = attn_score - 1e30 * attn_mask
# attention probability
attn_prob = tf.nn.softmax(attn_score, axis=1)
attn_prob = self.dropout(attn_prob, training=training)
# Mask heads if we want to
if head_mask is not None:
attn_prob = attn_prob * head_mask
# attention output
attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h)
if self.output_attentions:
return attn_vec, attn_prob
return attn_vec
def post_attention(self, inputs, residual=True, training=False):
"""Post-attention processing."""
# post-attention projection (back to `d_model`)
h, attn_vec = inputs
attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o)
attn_out = self.dropout(attn_out, training=training)
if residual:
attn_out = attn_out + h
output = self.layer_norm(attn_out)
return output
def call(self, inputs, training=False):
(h, g, attn_mask_h, attn_mask_g,
r, seg_mat, mems, target_mapping, head_mask) = inputs
if g is not None:
###### Two-stream attention with relative positional encoding.
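            # h is the content stream (each position may attend to its own content), g is the query
            # stream (it sees the position but not the token itself); the permutation-LM targets are
            # predicted from the g stream, as described in the XLNet paper.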
# content based attention score
if mems is not None and mems.shape.ndims > 1:
cat = tf.concat([mems, h], axis=0)
else:
cat = h
# content-based key head
k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k)
# content-based value head
v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v)
# position-based key head
k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r)
##### h-stream
# content-stream query head
q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q)
# core attention ops
attn_vec_h = self.rel_attn_core(
[q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask],
training=training)
if self.output_attentions:
attn_vec_h, attn_prob_h = attn_vec_h
# post processing
output_h = self.post_attention([h, attn_vec_h], training=training)
##### g-stream
# query-stream query head
q_head_g = tf.einsum('ibh,hnd->ibnd', g, self.q)
# core attention ops
if target_mapping is not None:
q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
attn_vec_g = self.rel_attn_core(
[q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask],
training=training)
if self.output_attentions:
attn_vec_g, attn_prob_g = attn_vec_g
attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping)
else:
attn_vec_g = self.rel_attn_core(
[q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask],
training=training)
if self.output_attentions:
attn_vec_g, attn_prob_g = attn_vec_g
# post processing
output_g = self.post_attention([g, attn_vec_g], training=training)
if self.output_attentions:
attn_prob = attn_prob_h, attn_prob_g
else:
###### Multi-head attention with relative positional encoding
if mems is not None and mems.shape.ndims > 1:
cat = tf.concat([mems, h], axis=0)
else:
cat = h
# content heads
q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q)
k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k)
v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v)
# positional heads
k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r)
# core attention ops
attn_vec = self.rel_attn_core(
[q_head_h, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_h, head_mask],
training=training)
if self.output_attentions:
attn_vec, attn_prob = attn_vec
# post processing
output_h = self.post_attention([h, attn_vec], training=training)
output_g = None
outputs = (output_h, output_g)
if self.output_attentions:
outputs = outputs + (attn_prob,)
return outputs
class TFXLNetFeedForward(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLNetFeedForward, self).__init__(**kwargs)
self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm')
self.layer_1 = tf.keras.layers.Dense(config.d_inner,
kernel_initializer=get_initializer(config.initializer_range),
name='layer_1')
self.layer_2 = tf.keras.layers.Dense(config.d_model,
kernel_initializer=get_initializer(config.initializer_range),
name='layer_2')
self.dropout = tf.keras.layers.Dropout(config.dropout)
if isinstance(config.ff_activation, str) or \
(sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)):
self.activation_function = ACT2FN[config.ff_activation]
else:
self.activation_function = config.ff_activation
def call(self, inp, training=False):
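        # Standard position-wise feed-forward block: Dense -> activation -> dropout -> Dense -> dropout,
        # followed by a residual connection and a post-LayerNorm.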
output = inp
output = self.layer_1(output)
output = self.activation_function(output)
output = self.dropout(output, training=training)
output = self.layer_2(output)
output = self.dropout(output, training=training)
output = self.layer_norm(output + inp)
return output
class TFXLNetLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLNetLayer, self).__init__(**kwargs)
self.rel_attn = TFXLNetRelativeAttention(config, name='rel_attn')
self.ff = TFXLNetFeedForward(config, name='ff')
self.dropout = tf.keras.layers.Dropout(config.dropout)
def call(self, inputs, training=False):
outputs = self.rel_attn(inputs, training=training)
output_h, output_g = outputs[:2]
if output_g is not None:
output_g = self.ff(output_g, training=training)
output_h = self.ff(output_h, training=training)
        outputs = (output_h, output_g) + outputs[2:]  # Add the attention weights again if they are returned
return outputs
class TFXLNetLMHead(tf.keras.layers.Layer):
def __init__(self, config, input_embeddings, **kwargs):
super(TFXLNetLMHead, self).__init__(**kwargs)
self.vocab_size = config.vocab_size
# The output weights are the same as the input embeddings, but there is
# an output-only bias for each token.
self.input_embeddings = input_embeddings
def build(self, input_shape):
self.bias = self.add_weight(shape=(self.vocab_size,),
initializer='zeros',
trainable=True,
name='bias')
super(TFXLNetLMHead, self).build(input_shape)
def call(self, hidden_states):
hidden_states = self.input_embeddings(hidden_states, mode="linear")
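        # mode="linear" reuses the shared embedding matrix as the output projection
        # (hidden_states @ embedding_weights^T), implementing the weight tying mentioned above.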
hidden_states = hidden_states + self.bias
return hidden_states
class TFXLNetMainLayer(tf.keras.layers.Layer):
def __init__(self, config, **kwargs):
super(TFXLNetMainLayer, self).__init__(**kwargs)
self.output_attentions = config.output_attentions
self.output_hidden_states = config.output_hidden_states
self.mem_len = config.mem_len
self.reuse_len = config.reuse_len
self.d_model = config.d_model
self.same_length = config.same_length
self.attn_type = config.attn_type
self.bi_data = config.bi_data
self.clamp_len = config.clamp_len
self.n_layer = config.n_layer
self.use_bfloat16 = config.use_bfloat16
self.initializer_range = config.initializer_range
self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
self.dropout = tf.keras.layers.Dropout(config.dropout)
def build(self, input_shape):
initializer = get_initializer(self.initializer_range)
self.mask_emb = self.add_weight(shape=(1, 1, self.d_model),
initializer=initializer,
trainable=True, name='mask_emb')
def _resize_token_embeddings(self, new_num_tokens):
raise NotImplementedError
def _prune_heads(self, heads_to_prune):
raise NotImplementedError
def create_mask(self, qlen, mlen, dtype=tf.float32):
"""
Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
Args:
            qlen: length of the current query segment (number of tokens being processed).
            mlen: length of the cached memory states from previous segments.
::
same_length=False: same_length=True:
<mlen > < qlen > <mlen > < qlen >
^ [0 0 0 0 0 1 1 1 1] [0 0 0 0 0 1 1 1 1]
[0 0 0 0 0 0 1 1 1] [1 0 0 0 0 0 1 1 1]
qlen [0 0 0 0 0 0 0 1 1] [1 1 0 0 0 0 0 1 1]
[0 0 0 0 0 0 0 0 1] [1 1 1 0 0 0 0 0 1]
v [0 0 0 0 0 0 0 0 0] [1 1 1 1 0 0 0 0 0]
"""
attn_mask = tf.ones([qlen, qlen], dtype=dtype)
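        # band_part(ones, 0, -1) keeps the upper triangle including the diagonal, band_part(ones, 0, 0)
        # keeps only the diagonal, so mask_u - mask_dia flags strictly-future positions; the mlen
        # zero-columns prepended on the left let every query position attend to all memory tokens.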
        mask_u = tf.linalg.band_part(attn_mask, 0, -1)
        mask_dia = tf.linalg.band_part(attn_mask, 0, 0)
        attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
        ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
        if self.same_length:
            mask_l = tf.linalg.band_part(attn_mask, -1, 0)
            ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
return ret
def cache_mem(self, curr_out, prev_mem):
"""cache hidden states into memory."""
if self.mem_len is None or self.mem_len == 0:
return None
else:
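            # When reuse_len is set, only the first reuse_len positions of the current segment are
            # cached (partial-prediction setting); the buffer is then truncated to the last mem_len
            # steps and wrapped in stop_gradient so no gradients flow into the memory, as in Transformer-XL.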
if self.reuse_len is not None and self.reuse_len > 0:
curr_out = curr_out[:self.reuse_len]
if prev_mem is None:
new_mem = curr_out[-self.mem_len:]
else:
new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len:]
return tf.stop_gradient(new_mem)
@staticmethod
def positional_embedding(pos_seq, inv_freq, bsz=None):
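        # Classic sinusoidal encoding: outer product of positions and inverse frequencies, sin/cos
        # halves concatenated along the feature axis; the result is [len, 1, d_model], tiled over the
        # batch axis when bsz is given.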
sinusoid_inp = tf.einsum('i,d->id', pos_seq, inv_freq)
pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], axis=-1)
pos_emb = pos_emb[:, None, :]
if bsz is not None:
pos_emb = tf.tile(pos_emb, [1, bsz, 1])
return pos_emb
def relative_positional_encoding(self, qlen, klen, bsz=None, dtype=None):
"""create relative positional encoding."""
freq_seq = tf.range(0, self.d_model, 2.0)
if dtype is not None and dtype != tf.float32:
freq_seq = tf.cast(freq_seq, dtype=dtype)
inv_freq = 1 / (10000 ** (freq_seq / self.d_model))
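        # Relative positions run from +klen downwards (to -qlen + 1 for 'bi', to 0 for 'uni') so the
        # oldest memory token gets the largest distance.  With bi_data an additional mirrored backward
        # sequence is built and concatenated along the batch axis, hence bsz must then be even.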
if self.attn_type == 'bi':
# beg, end = klen - 1, -qlen
beg, end = klen, -qlen
elif self.attn_type == 'uni':
# beg, end = klen - 1, -1
beg, end = klen, -1
else:
raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
if self.bi_data:
fwd_pos_seq = tf.range(beg, end, -1.0)
bwd_pos_seq = tf.range(-beg, -end, 1.0)
if dtype is not None and dtype != tf.float32:
fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
bwd_pos_seq = tf.cast(bwd_pos_seq, dtype=dtype)
if self.clamp_len > 0:
fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -self.clamp_len, self.clamp_len)
if bsz is not None:
# With bi_data, the batch size should be divisible by 2.
assert bsz%2 == 0
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
else:
fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1)
else:
fwd_pos_seq = tf.range(beg, end, -1.0)
if dtype is not None and dtype != tf.float32:
fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype)
if self.clamp_len > 0:
                fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -self.clamp_len, self.clamp_len)
pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
return pos_emb
def call(self, inputs, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None, training=False):
if isinstance(inputs, (tuple, list)):
input_ids = inputs[0]
attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
mems = inputs[2] if len(inputs) > 2 else mems
perm_mask = inputs[3] if len(inputs) > 3 else perm_mask
target_mapping = inputs[4] if len(inputs) > 4 else target_mapping
token_type_ids = inputs[5] if len(inputs) > 5 else token_type_ids
input_mask = inputs[6] if len(inputs) > 6 else input_mask
head_mask = inputs[7] if len(inputs) > 7 else head_mask
assert len(inputs) <= 8, "Too many inputs."
elif isinstance(inputs, dict):
input_ids = inputs.get('input_ids')
attention_mask = inputs.get('attention_mask', attention_mask)
mems = inputs.get('mems', mems)
perm_mask = inputs.get('perm_mask', perm_mask)
target_mapping = inputs.get('target_mapping', target_mapping)
token_type_ids = inputs.get('token_type_ids', token_type_ids)
input_mask = inputs.get('input_mask', input_mask)
head_mask = inputs.get('head_mask', head_mask)
assert len(inputs) <= 8, "Too many inputs."
else:
input_ids = inputs
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
# so we move here the first dimension (batch) to the end
input_ids = tf.transpose(input_ids, perm=(1, 0))
token_type_ids = tf.transpose(token_type_ids, perm=(1, 0)) if token_type_ids is not None else None
input_mask = tf.transpose(input_mask, perm=(1, 0)) if input_mask is not None else None
attention_mask = tf.transpose(attention_mask, perm=(1, 0)) if attention_mask is not None else None
perm_mask = tf.transpose(perm_mask, perm=(1, 2, 0)) if perm_mask is not None else None
target_mapping = tf.transpose(target_mapping, perm=(1, 2, 0)) if target_mapping is not None else None
qlen, bsz = shape_list(input_ids)[:2]
mlen = shape_list(mems[0])[0] if mems is not None and mems[0] is not None else 0
klen = mlen + qlen
dtype_float = tf.bfloat16 if self.use_bfloat16 else tf.float32
##### Attention mask
# causal attention mask
if self.attn_type == 'uni':
attn_mask = self.create_mask(qlen, mlen)
attn_mask = attn_mask[:, :, None, None]
elif self.attn_type == 'bi':
attn_mask = None
else:
raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
# data mask: input mask & perm mask
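        # In data_mask (and perm_mask) a value of 1 means "may NOT be attended to", so input_mask is
        # the complement of the BERT-style attention_mask; memory positions are always visible, which
        # is why zero columns are prepended for them below.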
        assert input_mask is None or attention_mask is None, (
            "You can only use one of input_mask (uses 1 for padding) "
            "or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one.")
if input_mask is None and attention_mask is not None:
input_mask = 1.0 - attention_mask
if input_mask is not None and perm_mask is not None:
data_mask = input_mask[None] + perm_mask
elif input_mask is not None and perm_mask is None:
data_mask = input_mask[None]
elif input_mask is None and perm_mask is not None:
data_mask = perm_mask
else:
data_mask = None
if data_mask is not None:
# all mems can be attended to
mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
dtype=dtype_float)
data_mask = tf.concat([mems_mask, data_mask], axis=1)
if attn_mask is None:
attn_mask = data_mask[:, :, :, None]
else:
attn_mask += data_mask[:, :, :, None]
if attn_mask is not None:
attn_mask = tf.cast(attn_mask > 0, dtype=dtype_float)
if attn_mask is not None:
non_tgt_mask = -tf.eye(qlen, dtype=dtype_float)
non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=dtype_float), non_tgt_mask], axis=-1)
non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0, dtype=dtype_float)
else:
non_tgt_mask = None
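        # attn_mask is used for the query (g) stream, where a token must not see its own target;
        # non_tgt_mask (the diagonal un-masked via the -eye term) is used for the content (h) stream,
        # where a token may attend to its own content.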
##### Word embeddings and prepare h & g hidden states
word_emb_k = self.word_embedding(input_ids)
output_h = self.dropout(word_emb_k, training=training)
if target_mapping is not None:
word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
# else: # We removed the inp_q input which was same as target mapping
# inp_q_ext = inp_q[:, :, None]
# word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
output_g = self.dropout(word_emb_q, training=training)
else:
output_g = None
##### Segment embedding
if token_type_ids is not None:
# Convert `token_type_ids` to one-hot `seg_mat`
mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)
cat_ids = tf.concat([mem_pad, token_type_ids], 0)
# `1` indicates not in the same segment [qlen x klen x bsz]
seg_mat = tf.cast(
tf.logical_not(tf.equal(token_type_ids[:, None], cat_ids[None, :])),
tf.int32)
seg_mat = tf.one_hot(seg_mat, 2, dtype=dtype_float)
else:
seg_mat = None
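        # XLNet uses *relative* segment encodings: seg_mat is a one-hot over {same segment, different
        # segment} for every (query, key) pair rather than over absolute segment ids.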
##### Positional encoding
pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz, dtype=dtype_float)
pos_emb = self.dropout(pos_emb, training=training)
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
        # and head_mask is made broadcastable to shape [num_hidden_layers x qlen x klen x bsz x n_head]
        if head_mask is not None:
            head_mask = tf.cast(head_mask, dtype=dtype_float)               # fp16/bfloat16 compatibility
            if head_mask.shape.ndims == 1:
                head_mask = head_mask[None, None, None, None, :]            # [1, 1, 1, 1, n_head]
                head_mask = tf.tile(head_mask, [self.n_layer, 1, 1, 1, 1])  # one mask per layer
            elif head_mask.shape.ndims == 2:
                head_mask = head_mask[:, None, None, None, :]               # [n_layer, 1, 1, 1, n_head]
else:
head_mask = [None] * self.n_layer
new_mems = ()
if mems is None:
mems = [None] * len(self.layer)
attentions = []
hidden_states = []
for i, layer_module in enumerate(self.layer):
# cache new mems
new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
if self.output_hidden_states:
hidden_states.append((output_h, output_g) if output_g is not None else output_h)
outputs = layer_module([output_h, output_g, non_tgt_mask, attn_mask,
pos_emb, seg_mat, mems[i], target_mapping,
head_mask[i]], training=training)
output_h, output_g = outputs[:2]
if self.output_attentions:
attentions.append(outputs[2])
# Add last hidden state
if self.output_hidden_states:
hidden_states.append((output_h, output_g) if output_g is not None else output_h)
output = self.dropout(output_g if output_g is not None else output_h, training=training)
# Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
outputs = (tf.transpose(output, perm=(1, 0, 2)), new_mems)
if self.output_hidden_states:
if output_g is not None:
hidden_states = tuple(tf.transpose(h, perm=(1, 0, 2)) for hs in hidden_states for h in hs)
else:
hidden_states = tuple(tf.transpose(hs, perm=(1, 0, 2)) for hs in hidden_states)
outputs = outputs + (hidden_states,)
if self.output_attentions:
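            # Per-layer attention tensors are computed as [qlen, klen, bsz, n_head]; transpose them to
            # the library-wide [bsz, n_head, qlen, klen] layout before returning.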
attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
outputs = outputs + (attentions,)
return outputs # outputs, new_mems, (hidden_states), (attentions)
class TFXLNetPreTrainedModel(TFPreTrainedModel):
""" An abstract class to handle weights initialization and
        a simple interface for downloading and loading pretrained models.
"""
config_class = XLNetConfig
pretrained_model_archive_map = TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
load_pt_weights = load_xlnet_pt_weights_in_tf2
base_model_prefix = "transformer"
XLNET_START_DOCSTRING = r""" The XLNet model was proposed in
`XLNet: Generalized Autoregressive Pretraining for Language Understanding`_
by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
    XLNet is an extension of the Transformer-XL model pre-trained using an autoregressive method
    to learn bidirectional contexts by maximizing the expected likelihood over all permutations
    of the input sequence factorization order.
    The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
    Due to the difficulty of training a fully autoregressive model over the various factorization orders,
    XLNet is pretrained using only a subset of the output tokens as targets, which are selected
    with the `target_mapping` input.
    To use XLNet for sequential decoding (i.e. not in a fully bidirectional setting), use the `perm_mask` and
    `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`).
    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
.. _`XLNet: Generalized Autoregressive Pretraining for Language Understanding`:
http://arxiv.org/abs/1906.08237
.. _`tf.keras.Model`:
https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
Note on the model inputs:
        TF 2.0 models accept two formats as inputs:
            - having all inputs as keyword arguments (like PyTorch models), or
            - having all inputs as a list, tuple or dict in the first positional argument.
        This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument:
        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
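        For instance, the three equivalent calls sketched below assume that a model instance and
        `input_ids` / `attention_mask` tensors of shape ``(batch_size, sequence_length)`` already exist::

            outputs = model(input_ids)                                                   # single Tensor
            outputs = model([input_ids, attention_mask])                                 # list, in docstring order
            outputs = model({'input_ids': input_ids, 'attention_mask': attention_mask})  # dict keyed by input names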
Parameters:
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
XLNET_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
XLNet is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`transformers.XLNetTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**mems**: (`optional`)
list of ``Numpy array`` or ``tf.Tensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
            To activate mems you need to set `config.mem_len` to a positive value which will be the max number of tokens in
            the memory output by the model. E.g. `model = TFXLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)` will
            instantiate a model which can use up to 1024 tokens of memory (in addition to the input itself).
**perm_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
            If ``perm_mask[k, i, j] = 0``, i attends to j in batch k;
if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
If None, each token attends to all the others (full bidirectional attention).
Only used during pretraining (to define factorization order) or for sequential decoding (generation).
**target_mapping**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, num_predict, sequence_length)``:
Mask to indicate the output tokens to use.
            If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
Only used during pretraining for partial prediction or for sequential decoding (generation).
**token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
the important thing is that they should be different for tokens which belong to different segments.
The model will compute relative segment differences from the given type indices:
            0 if the segment ids of the two tokens are the same, 1 if not.
**input_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
Kept for compatibility with the original code base.
            You can only use one of `input_mask` and `attention_mask`.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
**head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class TFXLNetModel(TFXLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**mems**:
list of ``tf.Tensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetModel.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name='transformer')
def call(self, inputs, **kwargs):
outputs = self.transformer(inputs, **kwargs)
return outputs
@add_start_docstrings("""XLNet Model with a language modeling head on top
(linear layer with weights tied to the input embeddings). """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class TFXLNetLMHeadModel(TFXLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**prediction_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
**mems**:
list of ``tf.Tensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
        import tensorflow as tf
        import numpy as np
from transformers import XLNetTokenizer, TFXLNetLMHeadModel
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
# We show how to setup inputs to predict a next token using a bi-directional context.
input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>"))[None, :] # We will predict the masked token
        # Build the masks with numpy (TF tensors do not support item assignment), then convert them.
        perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see the last token
        perm_mask = tf.constant(perm_mask, dtype=tf.float32)
        target_mapping = np.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
        target_mapping = tf.constant(target_mapping, dtype=tf.float32)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
        next_token_logits = outputs[0]  # Output has shape [target_mapping.shape[0], target_mapping.shape[1], config.vocab_size]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name='transformer')
self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
hidden_state = transformer_outputs[0]
logits = self.lm_loss(hidden_state)
        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states and attentions if they are present
return outputs # return logits, mems, (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of
the pooled output) e.g. for GLUE tasks. """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**logits**: ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
**mems**:
list of ``tf.Tensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForSequenceClassification
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
logits = outputs[0]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs)
self.num_labels = config.num_labels
self.transformer = TFXLNetMainLayer(config, name='transformer')
self.sequence_summary = TFSequenceSummary(config, initializer_range=config.initializer_range, name='sequence_summary')
self.logits_proj = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='logits_proj')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
output = transformer_outputs[0]
output = self.sequence_summary(output)
logits = self.logits_proj(output)
        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states and attentions if they are present
return outputs # return logits, mems, (hidden states), (attentions)
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
# the hidden-states output to compute `span start logits` and `span end logits`). """,
# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel):
class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel):
r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**start_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
import tensorflow as tf
from transformers import XLNetTokenizer, TFXLNetForQuestionAnsweringSimple
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased')
input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
outputs = model(input_ids)
start_scores, end_scores = outputs[:2]
"""
def __init__(self, config, *inputs, **kwargs):
super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs)
self.transformer = TFXLNetMainLayer(config, name='transformer')
self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
kernel_initializer=get_initializer(config.initializer_range),
name='qa_outputs')
def call(self, inputs, **kwargs):
transformer_outputs = self.transformer(inputs, **kwargs)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = tf.split(logits, 2, axis=-1)
start_logits = tf.squeeze(start_logits, axis=-1)
end_logits = tf.squeeze(end_logits, axis=-1)
        outputs = (start_logits, end_logits,) + transformer_outputs[1:]  # Keep mems, hidden states and attentions if they are present
return outputs # start_logits, end_logits, (hidden_states), (attentions)
# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
# the hidden-states output to compute `span start logits` and `span end logits`). """,
# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel):
# r"""
# Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
# **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``tf.Tensor`` of shape ``(batch_size, config.start_n_top)``
# Log probabilities for the top config.start_n_top start token possibilities (beam-search).
# **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
# Indices for the top config.start_n_top start token possibilities (beam-search).
# **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``tf.Tensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
# Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
# **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
# Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
# **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
# ``tf.Tensor`` of shape ``(batch_size,)``
# Log probabilities for the ``is_impossible`` label of the answers.
# **mems**:
# list of ``tf.Tensor`` (one for each layer):
# that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
# if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
# See details in the docstring of the `mems` input above.
# **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
# list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
# of shape ``(batch_size, sequence_length, hidden_size)``:
# Hidden-states of the model at the output of each layer plus the initial embedding outputs.
# **attentions**: (`optional`, returned when ``config.output_attentions=True``)
# list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
# Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
# Examples::
# tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
# model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
# input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1
# start_positions = tf.constant([1])
# end_positions = tf.constant([3])
# outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
# loss, start_scores, end_scores = outputs[:2]
# """
# def __init__(self, config, *inputs, **kwargs):
# super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
# self.start_n_top = config.start_n_top
# self.end_n_top = config.end_n_top
# self.transformer = TFXLNetMainLayer(config, name='transformer')
# self.start_logits = TFPoolerStartLogits(config, name='start_logits')
# self.end_logits = TFPoolerEndLogits(config, name='end_logits')
# self.answer_class = TFPoolerAnswerClass(config, name='answer_class')
# def call(self, inputs, training=False):
# transformer_outputs = self.transformer(inputs, training=training)
# hidden_states = transformer_outputs[0]
# start_logits = self.start_logits(hidden_states, p_mask=p_mask)
# outputs = transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it
# if start_positions is not None and end_positions is not None:
# # If we are on multi-GPU, let's remove the dimension added by batch splitting
# for x in (start_positions, end_positions, cls_index, is_impossible):
# if x is not None and x.dim() > 1:
# x.squeeze_(-1)
# # during training, compute the end logits based on the ground truth of the start position
# end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
# loss_fct = CrossEntropyLoss()
# start_loss = loss_fct(start_logits, start_positions)
# end_loss = loss_fct(end_logits, end_positions)
# total_loss = (start_loss + end_loss) / 2
# if cls_index is not None and is_impossible is not None:
# # Predict answerability from the representation of CLS and START
# cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
# loss_fct_cls = nn.BCEWithLogitsLoss()
# cls_loss = loss_fct_cls(cls_logits, is_impossible)
# # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
# total_loss += cls_loss * 0.5
# outputs = (total_loss,) + outputs
# else:
# # during inference, compute the end logits based on beam search
# bsz, slen, hsz = hidden_states.size()
# start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
# start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
# start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
# start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
# start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
# hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
# p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
# end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
# end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
# end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
# end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
# end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
# start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) # get the representation of START as weighted sum of hidden states
# cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample
# outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
# # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
# # or (if labels are provided) (total_loss,)
# return outputs
@@ -194,7 +194,7 @@ class PositionalEmbedding(nn.Module):
class PositionwiseFF(nn.Module):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False):
def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
super(PositionwiseFF, self).__init__()
self.d_model = d_model
@@ -208,7 +208,7 @@ class PositionwiseFF(nn.Module):
nn.Dropout(dropout),
)
self.layer_norm = nn.LayerNorm(d_model)
self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
self.pre_lnorm = pre_lnorm
@@ -229,102 +229,12 @@ class PositionwiseFF(nn.Module):
return output
class MultiHeadAttn(nn.Module):
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
pre_lnorm=False, r_r_bias=None, r_w_bias=None, output_attentions=False):
super(MultiHeadAttn, self).__init__()
self.output_attentions = output_attentions
self.n_head = n_head
self.d_model = d_model
self.d_head = d_head
self.dropout = dropout
self.q_net = nn.Linear(d_model, n_head * d_head, bias=False)
self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False)
self.drop = nn.Dropout(dropout)
self.dropatt = nn.Dropout(dropatt)
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
self.layer_norm = nn.LayerNorm(d_model)
self.scale = 1 / (d_head ** 0.5)
self.pre_lnorm = pre_lnorm
if r_r_bias is None or r_w_bias is None: # Biases are not shared
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
else:
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
def forward(self, h, attn_mask=None, mems=None, head_mask=None):
##### multihead attention
# [hlen x bsz x n_head x d_head]
if mems is not None:
c = torch.cat([mems, h], 0)
else:
c = h
if self.pre_lnorm:
##### layer normalization
c = self.layer_norm(c)
head_q = self.q_net(h)
head_k, head_v = torch.chunk(self.kv_net(c), 2, -1)
head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head)
head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head)
head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head)
# [qlen x klen x bsz x n_head]
attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k))
attn_score.mul_(self.scale)
if attn_mask is not None and torch.sum(attn_mask).item():
attn_mask = (attn_mask == 1) # Switch to bool
if attn_mask.dim() == 2:
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
elif attn_mask.dim() == 3:
attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf'))
# [qlen x klen x bsz x n_head]
attn_prob = F.softmax(attn_score, dim=1)
attn_prob = self.dropatt(attn_prob)
# Mask heads if we want to
if head_mask is not None:
attn_prob = attn_prob * head_mask
# [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head]
attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v))
attn_vec = attn_vec.contiguous().view(
attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
##### linear projection
attn_out = self.o_net(attn_vec)
attn_out = self.drop(attn_out)
if self.pre_lnorm:
##### residual connection
outputs = [h + attn_out]
else:
##### residual connection + layer normalization
outputs = [self.layer_norm(h + attn_out)]
if self.output_attentions:
outputs.append(attn_prob)
return outputs
class RelMultiHeadAttn(nn.Module):
class RelPartialLearnableMultiHeadAttn(nn.Module):
def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
r_r_bias=None, r_w_bias=None, output_attentions=False):
super(RelMultiHeadAttn, self).__init__()
r_r_bias=None, r_w_bias=None, output_attentions=False,
layer_norm_epsilon=1e-5):
super(RelPartialLearnableMultiHeadAttn, self).__init__()
self.output_attentions = output_attentions
self.n_head = n_head
@@ -338,7 +248,7 @@ class RelMultiHeadAttn(nn.Module):
self.dropatt = nn.Dropout(dropatt)
self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
self.layer_norm = nn.LayerNorm(d_model)
self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
self.scale = 1 / (d_head ** 0.5)
@@ -351,36 +261,9 @@ class RelMultiHeadAttn(nn.Module):
self.r_r_bias = r_r_bias
self.r_w_bias = r_w_bias
def _parallelogram_mask(self, h, w, left=False):
mask = torch.ones((h, w)).byte()
m = min(h, w)
mask[:m,:m] = torch.triu(mask[:m,:m])
mask[-m:,-m:] = torch.tril(mask[-m:,-m:])
if left:
return mask
else:
return mask.flip(0)
def _shift(self, x, qlen, klen, mask, left=False):
if qlen > 1:
zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)),
device=x.device, dtype=x.dtype)
else:
zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype)
if left:
mask = mask.flip(1)
x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1)
else:
x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1)
x = x_padded.masked_select(mask[:,:,None,None]) \
.view(qlen, klen, x.size(2), x.size(3))
return x
self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
def _rel_shift(self, x, zero_triu=False):
def _rel_shift(self, x):
zero_pad_shape = (x.size(0), 1) + x.size()[2:]
zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)
x_padded = torch.cat([zero_pad, x], dim=1)
@@ -390,21 +273,8 @@ class RelMultiHeadAttn(nn.Module):
x = x_padded[1:].view_as(x)
if zero_triu:
ones = torch.ones((x.size(0), x.size(1)))
x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None]
return x
def forward(self, w, r, attn_mask=None, mems=None):
raise NotImplementedError
class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
def __init__(self, *args, **kwargs):
super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs)
self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
def forward(self, w, r, attn_mask=None, mems=None, head_mask=None):
qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)
@@ -496,148 +366,17 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
return outputs
class RelLearnableMultiHeadAttn(RelMultiHeadAttn):
def __init__(self, *args, **kwargs):
super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs)
def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None, head_mask=None):
# r_emb: [klen, n_head, d_head], used for term B
# r_w_bias: [n_head, d_head], used for term C
# r_bias: [klen, n_head], used for term D
qlen, bsz = w.size(0), w.size(1)
if mems is not None:
cat = torch.cat([mems, w], 0)
if self.pre_lnorm:
w_heads = self.qkv_net(self.layer_norm(cat))
else:
w_heads = self.qkv_net(cat)
w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
w_head_q = w_head_q[-qlen:]
else:
if self.pre_lnorm:
w_heads = self.qkv_net(self.layer_norm(w))
else:
w_heads = self.qkv_net(w)
w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
klen = w_head_k.size(0)
w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)
w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)
w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)
if klen > r_emb.size(0):
r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1)
r_emb = torch.cat([r_emb_pad, r_emb], 0)
r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1)
r_bias = torch.cat([r_bias_pad, r_bias], 0)
else:
r_emb = r_emb[-klen:]
r_bias = r_bias[-klen:]
#### compute attention score
rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head
AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head
B_ = torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head
D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head
BD = self._rel_shift(B_ + D_)
# [qlen x klen x bsz x n_head]
attn_score = AC + BD
attn_score.mul_(self.scale)
#### compute attention probability
if attn_mask is not None and torch.sum(attn_mask).item():
attn_mask = (attn_mask == 1) # Switch to bool
if attn_mask.dim() == 2:
attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf'))
elif attn_mask.dim() == 3:
attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf'))
# [qlen x klen x bsz x n_head]
attn_prob = F.softmax(attn_score, dim=1)
attn_prob = self.dropatt(attn_prob)
if head_mask is not None:
attn_prob = attn_prob * head_mask
#### compute attention vector
attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v))
# [qlen x bsz x n_head x d_head]
attn_vec = attn_vec.contiguous().view(
attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
##### linear projection
attn_out = self.o_net(attn_vec)
attn_out = self.drop(attn_out)
if self.pre_lnorm:
##### residual connection
outputs = [w + attn_out]
else:
##### residual connection + layer normalization
outputs = [self.layer_norm(w + attn_out)]
if self.output_attentions:
outputs.append(attn_prob)
return outputs
class DecoderLayer(nn.Module):
def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs):
super(DecoderLayer, self).__init__()
self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
def forward(self, dec_inp, dec_attn_mask=None, mems=None, head_mask=None):
attn_outputs = self.dec_attn(dec_inp, attn_mask=dec_attn_mask,
mems=mems, head_mask=head_mask)
ff_output = self.pos_ff(attn_outputs[0])
outputs = [ff_output] + attn_outputs[1:]
return outputs
class RelLearnableDecoderLayer(nn.Module):
def __init__(self, n_head, d_model, d_head, d_inner, dropout,
**kwargs):
super(RelLearnableDecoderLayer, self).__init__()
self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout,
**kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None, head_mask=None):
attn_outputs = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias,
attn_mask=dec_attn_mask,
mems=mems, head_mask=head_mask)
ff_output = self.pos_ff(attn_outputs[0])
outputs = [ff_output] + attn_outputs[1:]
return outputs
class RelPartialLearnableDecoderLayer(nn.Module):
def __init__(self, n_head, d_model, d_head, d_inner, dropout,
def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5,
**kwargs):
super(RelPartialLearnableDecoderLayer, self).__init__()
self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model,
d_head, dropout, **kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'))
d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs)
self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
pre_lnorm=kwargs.get('pre_lnorm'),
layer_norm_epsilon=layer_norm_epsilon)
def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None):
@@ -651,7 +390,6 @@ class RelPartialLearnableDecoderLayer(nn.Module):
return outputs
class AdaptiveEmbedding(nn.Module):
def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
sample_softmax=False):
@@ -775,9 +513,6 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
if hasattr(m, 'r_bias'):
self._init_bias(m.r_bias)
def set_num_special_tokens(self, num_special_tokens):
pass
TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
@@ -796,9 +531,9 @@ TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
@@ -807,9 +542,9 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary.
Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
Indices can be obtained using :class:`transformers.TransfoXLTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mems**: (`optional`)
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
@@ -888,45 +623,19 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
r_w_bias=None if config.untie_r else self.r_w_bias,
r_r_bias=None if config.untie_r else self.r_r_bias,
output_attentions=self.output_attentions)
)
elif config.attn_type == 1: # learnable embeddings
for i in range(config.n_layer):
self.layers.append(
RelLearnableDecoderLayer(
config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
r_w_bias=None if config.untie_r else self.r_w_bias,
r_r_bias=None if config.untie_r else self.r_r_bias,
output_attentions=self.output_attentions)
)
elif config.attn_type in [2, 3]: # absolute embeddings
for i in range(config.n_layer):
self.layers.append(
DecoderLayer(
config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
r_w_bias=None if config.untie_r else self.r_w_bias,
r_r_bias=None if config.untie_r else self.r_r_bias,
output_attentions=self.output_attentions)
output_attentions=self.output_attentions,
layer_norm_epsilon=config.layer_norm_epsilon)
)
else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
raise NotImplementedError # Removed them to avoid maintaining dead code
self.same_length = config.same_length
self.clamp_len = config.clamp_len
if self.attn_type == 0: # default attention
self.pos_emb = PositionalEmbedding(self.d_model)
elif self.attn_type == 1: # learnable
self.r_emb = nn.Parameter(torch.FloatTensor(
self.n_layer, self.max_klen, self.n_head, self.d_head))
self.r_bias = nn.Parameter(torch.FloatTensor(
self.n_layer, self.max_klen, self.n_head))
elif self.attn_type == 2: # absolute standard
self.pos_emb = PositionalEmbedding(self.d_model)
elif self.attn_type == 3: # absolute deeper SA
self.r_emb = nn.Parameter(torch.FloatTensor(
self.n_layer, self.max_klen, self.n_head, self.d_head))
else: # learnable embeddings and absolute embeddings
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
self.init_weights()
@@ -981,8 +690,15 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return new_mems
def _forward(self, dec_inp, mems=None, head_mask=None):
qlen, bsz = dec_inp.size()
def forward(self, input_ids, mems=None, head_mask=None):
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = input_ids.transpose(0, 1).contiguous()
if mems is None:
mems = self.init_mems(input_ids)
qlen, bsz = input_ids.size()
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
@@ -999,7 +715,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
else:
head_mask = [None] * self.n_layer
word_emb = self.word_emb(dec_inp)
word_emb = self.word_emb(input_ids)
mlen = mems[0].size(0) if mems is not None else 0
klen = mlen + qlen
@@ -1036,64 +752,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
elif self.attn_type == 1: # learnable
core_out = self.drop(word_emb)
for i, layer in enumerate(self.layers):
hids.append(core_out)
if self.clamp_len > 0:
r_emb = self.r_emb[i][-self.clamp_len :]
r_bias = self.r_bias[i][-self.clamp_len :]
else:
r_emb, r_bias = self.r_emb[i], self.r_bias[i]
mems_i = None if mems is None else mems[i]
layer_outputs = layer(core_out, r_emb, self.r_w_bias[i],
r_bias, dec_attn_mask=dec_attn_mask,
mems=mems_i, head_mask=head_mask[i])
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
elif self.attn_type == 2: # absolute
pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
dtype=word_emb.dtype)
if self.clamp_len > 0:
pos_seq.clamp_(max=self.clamp_len)
pos_emb = self.pos_emb(pos_seq)
core_out = self.drop(word_emb + pos_emb[-qlen:])
for i, layer in enumerate(self.layers):
hids.append(core_out)
mems_i = None if mems is None else mems[i]
if mems_i is not None and i == 0:
mems_i += pos_emb[:mlen]
layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask,
mems=mems_i, head_mask=head_mask[i])
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
elif self.attn_type == 3:
core_out = self.drop(word_emb)
for i, layer in enumerate(self.layers):
hids.append(core_out)
mems_i = None if mems is None else mems[i]
if mems_i is not None and mlen > 0:
cur_emb = self.r_emb[i][:-qlen]
cur_size = cur_emb.size(0)
if cur_size < mlen:
cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1)
cur_emb = torch.cat([cur_emb_pad, cur_emb], 0)
else:
cur_emb = cur_emb[-mlen:]
mems_i += cur_emb.view(mlen, 1, -1)
core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)
layer_outputs = layer(core_out, dec_attn_mask=dec_attn_mask,
mems=mems_i, head_mask=head_mask[i])
core_out = layer_outputs[0]
if self.output_attentions:
attentions.append(layer_outputs[1])
else: # learnable embeddings and absolute embeddings
raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
core_out = self.drop(core_out)
@@ -1110,16 +770,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
# Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
outputs.append(attentions)
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
def forward(self, input_ids, mems=None, head_mask=None):
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = input_ids.transpose(0, 1).contiguous()
if mems is None:
mems = self.init_mems(input_ids)
outputs = self._forward(input_ids, mems=mems, head_mask=head_mask)
return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
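A sketch of the memory recurrence described by the comments and docstring above, assuming the `transfo-xl-wt103` shortcut and the standard tokenizer API (both taken from the library's documented usage, not from this diff): inputs are shaped [bsz, len], the model returns the last hidden state plus `new_mems`, and those mems are fed back in for the next segment.

import torch
from transformers import TransfoXLTokenizer, TransfoXLModel

tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLModel.from_pretrained('transfo-xl-wt103')

segment_1 = torch.tensor([tokenizer.encode("The quick brown fox")])        # shape [1, len_1]
segment_2 = torch.tensor([tokenizer.encode("jumps over the lazy dog")])    # shape [1, len_2]

hidden_1, mems = model(segment_1)[:2]             # first segment, memory initialized internally
hidden_2, mems = model(segment_2, mems=mems)[:2]  # second segment attends to the cached hidden states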
......
@@ -31,7 +31,7 @@ from torch.nn import CrossEntropyLoss
from torch.nn import functional as F
from .configuration_utils import PretrainedConfig
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME
from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
logger = logging.getLogger(__name__)
@@ -52,16 +52,16 @@ except ImportError:
class PreTrainedModel(nn.Module):
r""" Base class for all models.
:class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
:class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
Class attributes (overridden by derived classes):
- ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
- ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
- ``pretrained_model_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
- ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
- ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
- ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
- ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
- ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
- ``path``: a path (string) to the TensorFlow checkpoint.
- ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
@@ -189,7 +189,7 @@ class PreTrainedModel(nn.Module):
def save_pretrained(self, save_directory):
""" Save a model and its configuration file to a directory, so that it
can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method.
can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
"""
assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -201,8 +201,8 @@ class PreTrainedModel(nn.Module):
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
logger.info("Model weights saved in {}".format(output_model_file))
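A sketch of the save/re-load round trip this method enables; `BertModel` and the `bert-base-uncased` shortcut are used purely for illustration and are not part of this file.

from transformers import BertModel

model = BertModel.from_pretrained('bert-base-uncased')
model.save_pretrained('./my_model_directory/')     # writes the weights (and, in the full method, the configuration) to the directory

reloaded = BertModel.from_pretrained('./my_model_directory/')   # later, reload from the same directory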
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -220,23 +220,24 @@ class PreTrainedModel(nn.Module):
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
- a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
- None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
model_args: (`optional`) Sequence of positional arguments:
All remaining positional arguments will be passed to the underlying model's ``__init__`` method
config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
- the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
- the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
- the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
state_dict: (`optional`) dict:
an optional state dictionary for the model to use instead of a state dictionary loaded from the saved weights file.
This option can be used if you want to create a model from a pretrained configuration but load your own weights.
In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
cache_dir: (`optional`) string:
Path to a directory in which a downloaded pre-trained model
@@ -256,7 +257,7 @@ class PreTrainedModel(nn.Module):
Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
- If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
- If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
Examples::
@@ -289,103 +290,125 @@ class PreTrainedModel(nn.Module):
model_kwargs = kwargs
# Load model
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
if from_tf:
# Directly load from a TensorFlow checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
if pretrained_model_name_or_path is not None:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
elif os.path.isdir(pretrained_model_name_or_path):
if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
# Load from a TF 1.0 checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
# Load from a TF 2.0 checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
# Load from a PyTorch checkpoint
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
else:
raise EnvironmentError("Error: no file named {} found in directory {} or `from_tf` set to False".format(
[WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
pretrained_model_name_or_path))
elif os.path.isfile(pretrained_model_name_or_path):
archive_file = pretrained_model_name_or_path
else:
archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
else:
if from_tf:
# Directly load from a TensorFlow checkpoint
assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
archive_file = pretrained_model_name_or_path + ".index"
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(cls.pretrained_model_archive_map.keys()),
archive_file))
raise e
if resolved_archive_file == archive_file:
logger.info("loading weights file {}".format(archive_file))
else:
archive_file = pretrained_model_name_or_path
# redirect to the cache, if necessary
try:
resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
except EnvironmentError as e:
if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
logger.error(
"Couldn't reach server at '{}' to download pretrained weights.".format(
archive_file))
else:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name_or_path,
', '.join(cls.pretrained_model_archive_map.keys()),
archive_file))
raise e
if resolved_archive_file == archive_file:
logger.info("loading weights file {}".format(archive_file))
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
else:
logger.info("loading weights file {} from cache at {}".format(
archive_file, resolved_archive_file))
resolved_archive_file = None
# Instantiate model.
model = cls(config, *model_args, **model_kwargs)
if state_dict is None and not from_tf:
state_dict = torch.load(resolved_archive_file, map_location='cpu')
if from_tf:
# Directly load from a TensorFlow checkpoint
return cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index'
# Convert old format to new format if needed from a PyTorch state_dict
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
# Load from a PyTorch state_dict
missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')
# Make sure we are able to load base models as well as derived models (with heads)
start_prefix = ''
model_to_load = model
if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
start_prefix = cls.base_model_prefix + '.'
if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
model_to_load = getattr(model, cls.base_model_prefix)
load(model_to_load, prefix=start_prefix)
if len(missing_keys) > 0:
logger.info("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
logger.info("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs)))
if from_tf:
if resolved_archive_file.endswith('.index'):
# Load from a TensorFlow 1.X checkpoint - provided by original authors
model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index'
else:
# Load from our TensorFlow 2.0 checkpoints
try:
from transformers import load_tf2_checkpoint_in_pytorch_model
model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
except ImportError as e:
logger.error("Loading a TensorFlow model in PyTorch requires both PyTorch and TensorFlow to be installed. Please see "
"https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
raise e
else:
# Convert old format to new format if needed from a PyTorch state_dict
old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)
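# Illustration (not executed, hypothetical key shown only to make the pattern concrete):
# an old-format entry such as
#   'bert.encoder.layer.0.attention.output.LayerNorm.gamma'
# is renamed here to
#   'bert.encoder.layer.0.attention.output.LayerNorm.weight'
# (and '...beta' to '...bias'), so checkpoints saved before the
# gamma/beta -> weight/bias rename can still be loaded.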
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata
def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')
# Make sure we are able to load base models as well as derived models (with heads)
start_prefix = ''
model_to_load = model
if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
start_prefix = cls.base_model_prefix + '.'
if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
model_to_load = getattr(model, cls.base_model_prefix)
load(model_to_load, prefix=start_prefix)
if len(missing_keys) > 0:
logger.info("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
logger.info("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
model.__class__.__name__, "\n\t".join(error_msgs)))
if hasattr(model, 'tie_weights'):
model.tie_weights() # make sure word embedding weights are still tied
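A condensed sketch of the local-directory resolution order implemented above (the helper name is illustrative and not part of the library; the file-name constants are the ones imported from `file_utils` at the top of this module):

import os

def resolve_archive_file(model_dir, from_tf=False):
    # mirrors the order used by from_pretrained when given a local directory
    if from_tf and os.path.isfile(os.path.join(model_dir, TF_WEIGHTS_NAME + ".index")):
        return os.path.join(model_dir, TF_WEIGHTS_NAME + ".index")   # TF 1.X checkpoint
    if from_tf and os.path.isfile(os.path.join(model_dir, TF2_WEIGHTS_NAME)):
        return os.path.join(model_dir, TF2_WEIGHTS_NAME)             # TF 2.0 (.h5) checkpoint
    if os.path.isfile(os.path.join(model_dir, WEIGHTS_NAME)):
        return os.path.join(model_dir, WEIGHTS_NAME)                 # PyTorch checkpoint
    raise EnvironmentError("no checkpoint found in {}".format(model_dir))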
@@ -478,7 +501,10 @@ class PoolerEndLogits(nn.Module):
x = self.dense_1(x).squeeze(-1)
if p_mask is not None:
x = x * (1 - p_mask) - 1e30 * p_mask
if next(self.parameters()).dtype == torch.float16:
x = x * (1 - p_mask) - 65500 * p_mask
else:
x = x * (1 - p_mask) - 1e30 * p_mask
return x
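The dtype branch above exists because float16 cannot represent -1e30 (its largest finite value is about 65504), so the masking constant would overflow and can poison a later softmax; a toy check of the finite variant (illustrative only):

import torch

print(torch.finfo(torch.float16).max)                          # 65504.0

x = torch.zeros(1, 4, dtype=torch.float16)
p_mask = torch.tensor([[0., 0., 1., 1.]], dtype=torch.float16)
masked = x * (1 - p_mask) - 65500 * p_mask
print(masked)   # masked positions get a large finite negative value, no overflow in fp16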
@@ -531,7 +557,7 @@ class SQuADHead(nn.Module):
r""" A SQuAD head inspired by XLNet.
Parameters:
config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Inputs:
**hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
@@ -682,7 +708,7 @@ class SequenceSummary(nn.Module):
self.last_dropout = nn.Dropout(config.summary_last_dropout)
def forward(self, hidden_states, cls_index=None):
""" hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
""" hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
cls_index: [optional] position of the classification token if summary_type == 'cls_index',
shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
if summary_type == 'cls_index' and cls_index is None:
......
@@ -63,7 +63,7 @@ def gelu(x):
GELU activation
https://arxiv.org/abs/1606.08415
https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py
https://github.com/huggingface/transformers/blob/master/modeling.py
"""
# return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
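The commented-out line is the tanh approximation of GELU, while the active line uses the exact erf form; the two agree closely, as a quick illustrative check shows:

import math
import torch

x = torch.linspace(-3, 3, steps=13)
gelu_erf = 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
gelu_tanh = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
print((gelu_erf - gelu_tanh).abs().max())   # small, roughly 1e-3 or below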
@@ -265,9 +265,9 @@ XLM_START_DOCSTRING = r""" The XLM model was proposed in
https://github.com/facebookresearch/XLM
Parameters:
config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
XLM_INPUTS_DOCSTRING = r"""
@@ -278,9 +278,9 @@ XLM_INPUTS_DOCSTRING = r"""
XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
Indices can be obtained using :class:`transformers.XLMTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Mask values selected in ``[0, 1]``:
@@ -337,11 +337,6 @@ class XLMModel(XLMPreTrainedModel):
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
"""
ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output',
'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads',
'hidden_dim', 'dropout', 'attention_dropout', 'asm',
'asm_cutoffs', 'asm_div_value']
def __init__(self, config): #, dico, is_encoder, with_output):
super(XLMModel, self).__init__(config)
self.output_attentions = config.output_attentions
@@ -568,10 +563,10 @@ class XLMPredLayer(nn.Module):
"""
outputs = ()
if self.asm is False:
scores = self.proj(x).view(-1, self.n_words)
scores = self.proj(x)
outputs = (scores,) + outputs
if y is not None:
loss = F.cross_entropy(scores, y, reduction='elementwise_mean')
loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction='elementwise_mean')
outputs = (loss,) + outputs
else:
scores = self.proj.log_prob(x)
@@ -723,6 +718,101 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels whether a question has an answer or no answer (SQuAD 2.0)
**cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
**p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:3]
"""
def __init__(self, config):
super(XLMForQuestionAnsweringSimple, self).__init__(config)
self.transformer = XLMModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None):
transformer_outputs = self.transformer(input_ids,
attention_mask=attention_mask,
langs=langs,
token_type_ids=token_type_ids,
position_ids=position_ids,
lengths=lengths,
cache=cache,
head_mask=head_mask)
sequence_output = transformer_outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,)
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, the batch split may add an extra dimension; squeeze it
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here
return outputs
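A sketch of how the start/end logits returned above are typically decoded into an answer span at inference time; the greedy argmax decoding is illustrative, not part of this class, and it reuses `model` and `input_ids` in the spirit of the docstring example above.

import torch

with torch.no_grad():
    start_logits, end_logits = model(input_ids)[:2]   # no positions passed, so no loss in the outputs
start_index = int(start_logits.argmax(dim=-1)[0])
end_index = int(end_logits.argmax(dim=-1)[0])
answer_tokens = input_ids[0, start_index:end_index + 1]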
@add_start_docstrings("""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
class XLMForQuestionAnswering(XLMPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
......
@@ -488,9 +488,9 @@ XLNET_START_DOCSTRING = r""" The XLNet model was proposed in
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
XLNET_INPUTS_DOCSTRING = r"""
@@ -499,9 +499,9 @@ XLNET_INPUTS_DOCSTRING = r"""
Indices of input sequence tokens in the vocabulary.
XLNet is a model with relative position embeddings so you can either pad the inputs on
the right or on the left.
Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
Indices can be obtained using :class:`transformers.XLNetTokenizer`.
See :func:`transformers.PreTrainedTokenizer.encode` and
:func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
@@ -531,8 +531,10 @@ XLNET_INPUTS_DOCSTRING = r"""
Only used during pretraining for partial prediction or for sequential decoding (generation).
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
The embeddings from these tokens will be summed with the respective token embeddings.
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
the important thing is that they should be different for tokens which belong to different segments.
The model will compute relative segment differences from the given type indices:
0 if the segment id of two tokens are the same, 1 if not.
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
Mask to avoid performing attention on padding token indices.
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
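A small sketch of the two relations described above, on toy tensors (illustrative only): the relative segment matrix derived from `token_type_ids`, and `input_mask` as the complement of `attention_mask`.

import torch

token_type_ids = torch.tensor([[0, 0, 0, 1, 1]])
# 0 where two tokens share a segment id, 1 where they differ
relative_segments = (token_type_ids[:, :, None] != token_type_ids[:, None, :]).long()

attention_mask = torch.tensor([[1., 1., 1., 1., 0.]])
input_mask = 1.0 - attention_mask   # 0 for real tokens, 1 for padding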
@@ -1103,6 +1105,101 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel):
return outputs # return (loss), logits, mems, (hidden states), (attentions)
@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
r"""
**start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
**end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`).
Positions outside of the sequence are not taken into account for computing the loss.
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
**start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-start scores (before SoftMax).
**end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
Span-end scores (before SoftMax).
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
See details in the docstring of the `mems` input above.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-large-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
loss, start_scores, end_scores = outputs[:3]
"""
def __init__(self, config):
super(XLNetForQuestionAnsweringSimple, self).__init__(config)
self.num_labels = config.num_labels
self.transformer = XLNetModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.init_weights()
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
token_type_ids=None, input_mask=None, head_mask=None,
start_positions=None, end_positions=None):
outputs = self.transformer(input_ids,
attention_mask=attention_mask,
mems=mems,
perm_mask=perm_mask,
target_mapping=target_mapping,
token_type_ids=token_type_ids,
input_mask=input_mask,
head_mask=head_mask)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,) + outputs[2:]
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, the batch split may add an extra dimension; squeeze it
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions.clamp_(0, ignored_index)
end_positions.clamp_(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
outputs = (total_loss,) + outputs
return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
the hidden-states output to compute `span start logits` and `span end logits`). """,
XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
......