Commit 1a79eae3 authored by Zihan Wang's avatar Zihan Wang
Browse files

longformer

parent 14342e4e
# Longformer: The Long-Document Transformer
## Modifications from Huggingface's Implementation
All models require a `global_attention_size` specified in the config,
setting a global attention for all first `global_attention_size` tokens in any sentence.
Individual different global attention sizes for sentences are not supported.
This setting allows running on TPUs where tensor sizes have to be determined.
`_get_global_attn_indices` in `longformer_attention.py` contains how the new global attention indices are specified.
Changed all `tf.cond` to if confiditions, since global attention is specified in the start now.
`sentence_prediction_with_checkpoint_convert.py` now contains a `initial_parameters_from_pk` parameter that
specified a pk file containing all pre-trained weights from a pytorch longformer, which can be loaded into the
tf model.
The pk file can be generated from `utils/get_parameters_from_pretrained_pytorch_checkpoint.py`.
There is also a `longformer_tokenizer_to_tfrecord.py` that transformers pytorch longformer tokenized data to tf_records.
## Running
```bash
python utils/get_parameters_from_pretrained_pytorch_checkpoint.py
TRAIN_DATA=task.train_data.input_path=gs://model-garden-ucsd-zihan/longformer_allenai_mnli_train.tf_record,task.validation_data.input_path=gs://model-garden-ucsd-zihan/longformer_allenai_mnli_eval.tf_record
PYTHONPATH=/path/to/model/garden \
python3 train.py \
--experiment=longformer/glue \
--config_file=experiments/glue_mnli_allenai.yaml \
--params_override="${TRAIN_DATA},runtime.distribution_strategy=tpu,task.initial_parameters_from_pk=allenai_longformer-base-4096.pk" \
--tpu=local \
--model_dir=/path/to/outputdir \
--mode=train_and_eval
```
task:
hub_module_url: ''
model:
num_classes: 3
encoder:
type: any
any:
max_position_embeddings: 512
attention_window: [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32]
global_attention_size: 1
metric_type: 'accuracy'
train_data:
drop_remainder: true
global_batch_size: 32
input_path: TODO
is_training: true
seq_length: 128
validation_data:
drop_remainder: true
global_batch_size: 32
input_path: TODO
is_training: false
seq_length: 128
trainer:
checkpoint_interval: 1000
continuous_eval_timeout: 7200
optimizer_config:
learning_rate:
polynomial:
decay_steps: 61359
end_learning_rate: 0.0
initial_learning_rate: 3.0e-05
power: 1.0
type: polynomial
optimizer:
type: adamw
warmup:
polynomial:
power: 1
warmup_steps: 6136
type: polynomial
steps_per_loop: 100
summary_interval: 100
# Training data size 392,702 examples, 5 epochs.
train_steps: 61359
validation_interval: 2000
validation_steps: 307
task:
hub_module_url: ''
model:
num_classes: 3
encoder:
type: any
any:
max_position_embeddings: 4098
attention_window: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128]
global_attention_size: 1
vocab_size: 50265
metric_type: 'accuracy'
train_data:
drop_remainder: true
global_batch_size: 32
input_path: TODO
is_training: true
seq_length: 512
validation_data:
drop_remainder: true
global_batch_size: 32
input_path: TODO
is_training: false
seq_length: 512
trainer:
checkpoint_interval: 1000
continuous_eval_timeout: 7200
optimizer_config:
learning_rate:
polynomial:
decay_steps: 61359
end_learning_rate: 0.0
initial_learning_rate: 3.0e-05
power: 1.0
type: polynomial
optimizer:
type: adamw
warmup:
polynomial:
power: 1
warmup_steps: 6136
type: polynomial
steps_per_loop: 1000
summary_interval: 1000
# Training data size 392,702 examples, 5 epochs.
train_steps: 61359
validation_interval: 2000
validation_steps: 307
This diff is collapsed.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Longformer model configurations and instantiation methods."""
import dataclasses
import gin
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.configs import encoders
from official.projects.longformer.longformer_encoder import LongformerEncoder
from typing import List
@dataclasses.dataclass
class LongformerEncoderConfig(encoders.BertEncoderConfig):
'''Extra paramerters for Longformer configs
Args:
attention_window: list of ints representing the window size for each layer.
global_attention_size: the size of global attention used for each token.
'''
attention_window: List[int] = dataclasses.field(default_factory=list)
global_attention_size: int = 0
@gin.configurable
@base_config.bind(LongformerEncoderConfig)
def get_encoder(encoder_cfg: LongformerEncoderConfig):
"""Gets a 'LongformerEncoder' object.
Args:
encoder_cfg: A 'LongformerEncoderConfig'.
Returns:
A encoder object.
"""
encoder = LongformerEncoder(
attention_window=encoder_cfg.attention_window,
global_attention_size=encoder_cfg.global_attention_size,
vocab_size=encoder_cfg.vocab_size,
hidden_size=encoder_cfg.hidden_size,
num_layers=encoder_cfg.num_layers,
num_attention_heads=encoder_cfg.num_attention_heads,
intermediate_size=encoder_cfg.intermediate_size,
activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
dropout_rate=encoder_cfg.dropout_rate,
attention_dropout_rate=encoder_cfg.attention_dropout_rate,
max_sequence_length=encoder_cfg.max_position_embeddings,
type_vocab_size=encoder_cfg.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
output_range=encoder_cfg.output_range,
embedding_width=encoder_cfg.embedding_size,
norm_first=encoder_cfg.norm_first
)
return encoder
This diff is collapsed.
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attention layer."""
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.projects.longformer import longformer_attention
def _create_mock_attention_data(
num_heads,
key_dim,
value_dim,
q_seq_length,
kv_seq_length,
batch_size,
include_mask=False):
"""Creates mock testing data.
Args:
num_heads: `int`, Number of attention heads.
key_dim: `int`, Size of query head.
value_dim: `int`, Size of key, value dim.
seq_length: `int`, Sequence length of the input.
batch_size: `int`, the batch size.
include_mask: optional `bool`, whether or not to include mask data.
Returns:
A dictionary with `str` as keys and `Tensor` as values.
"""
query_shape = (batch_size, q_seq_length, key_dim)
value_shape = (batch_size, kv_seq_length, value_dim)
data = dict(
query=tf.random.normal(shape=query_shape),
value=tf.random.normal(shape=value_shape),
key=tf.random.normal(shape=value_shape))
total_seq_length = kv_seq_length
if include_mask:
mask_shape = (batch_size, num_heads, q_seq_length, total_seq_length)
mask_data = np.random.randint(2, size=mask_shape).astype("float32")
mask_data = dict(attention_mask=mask_data)
data.update(mask_data)
return data
@keras_parameterized.run_all_keras_modes
class LongformerAttentionTest(keras_parameterized.TestCase):
def _get_hidden_states(self):
return tf.convert_to_tensor(
[
[
[
4.98332758e-01,
2.69175139e00,
-7.08081422e-03,
1.04915401e00,
-1.83476661e00,
7.67220476e-01,
2.98580543e-01,
2.84803992e-02,
],
[
-7.58357372e-01,
4.20635998e-01,
-4.04739919e-02,
1.59924145e-01,
2.05135748e00,
-1.15997978e00,
5.37166397e-01,
2.62873606e-01,
],
[
-1.69438001e00,
4.17574660e-01,
-1.49196962e00,
-1.76483717e00,
-1.94566312e-01,
-1.71183858e00,
7.72903565e-01,
-1.11557056e00,
],
[
5.44028163e-01,
2.05466114e-01,
-3.63045868e-01,
2.41865062e-01,
3.20348382e-01,
-9.05611176e-01,
-1.92690727e-01,
-1.19917547e00,
],
]
],
dtype=tf.float32,
)
def test_diagonalize(self):
hidden_states = self._get_hidden_states()
hidden_states = tf.reshape(hidden_states, (1, 8, 4)) # set seq length = 8, hidden dim = 4
chunked_hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2)
window_overlap_size = longformer_attention.shape_list(chunked_hidden_states)[2]
self.assertTrue(window_overlap_size == 4)
padded_hidden_states = longformer_attention.LongformerAttention._pad_and_diagonalize(chunked_hidden_states)
self.assertTrue(
longformer_attention.shape_list(padded_hidden_states)[-1] == longformer_attention.shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1
)
# first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000]
tf.debugging.assert_near(padded_hidden_states[0, 0, 0, :4], chunked_hidden_states[0, 0, 0], rtol=1e-3)
tf.debugging.assert_near(padded_hidden_states[0, 0, 0, 4:], tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3)
# last row => [0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629]
tf.debugging.assert_near(padded_hidden_states[0, 0, -1, 3:], chunked_hidden_states[0, 0, -1], rtol=1e-3)
tf.debugging.assert_near(
padded_hidden_states[0, 0, -1, :3], tf.zeros((3,), dtype=tf.dtypes.float32), rtol=1e-3
)
def test_pad_and_transpose_last_two_dims(self):
hidden_states = self._get_hidden_states()
self.assertTrue(longformer_attention.shape_list(hidden_states), [1, 8, 4])
# pad along seq length dim
paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]], dtype=tf.dtypes.int32)
hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2)
padded_hidden_states = longformer_attention.LongformerAttention._pad_and_transpose_last_two_dims(hidden_states, paddings)
self.assertTrue(longformer_attention.shape_list(padded_hidden_states) == [1, 1, 8, 5])
expected_added_dim = tf.zeros((5,), dtype=tf.dtypes.float32)
tf.debugging.assert_near(expected_added_dim, padded_hidden_states[0, 0, -1, :], rtol=1e-6)
tf.debugging.assert_near(
hidden_states[0, 0, -1, :], tf.reshape(padded_hidden_states, (1, -1))[0, 24:32], rtol=1e-6
)
def test_mask_invalid_locations(self):
hidden_states = self._get_hidden_states()
batch_size = 1
seq_length = 8
hidden_size = 4
hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size))
hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2)
hid_states_1 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states, 1)
hid_states_2 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states, 2)
hid_states_3 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states[:, :, :, :3], 2)
hid_states_4 = longformer_attention.LongformerAttention._mask_invalid_locations(hidden_states[:, :, 2:, :], 2)
self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_1), tf.dtypes.int32)) == 8)
self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_2), tf.dtypes.int32)) == 24)
self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_3), tf.dtypes.int32)) == 24)
self.assertTrue(tf.math.reduce_sum(tf.cast(tf.math.is_inf(hid_states_4), tf.dtypes.int32)) == 12)
def test_chunk(self):
hidden_states = self._get_hidden_states()
batch_size = 1
seq_length = 8
hidden_size = 4
hidden_states = tf.reshape(hidden_states, (batch_size, seq_length, hidden_size))
chunked_hidden_states = longformer_attention.LongformerAttention._chunk(hidden_states, window_overlap=2)
# expected slices across chunk and seq length dim
expected_slice_along_seq_length = tf.convert_to_tensor([0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32)
expected_slice_along_chunk = tf.convert_to_tensor([0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32)
self.assertTrue(longformer_attention.shape_list(chunked_hidden_states) == [1, 3, 4, 4])
tf.debugging.assert_near(chunked_hidden_states[0, :, 0, 0], expected_slice_along_seq_length, rtol=1e-3)
tf.debugging.assert_near(chunked_hidden_states[0, 0, :, 0], expected_slice_along_chunk, rtol=1e-3)
def test_layer_local_attn(self):
hidden_states = self._get_hidden_states()
batch_size, seq_length, hidden_size = hidden_states.shape
layer = longformer_attention.LongformerAttention(
num_heads=2,
key_dim=4,
value_dim=4,
layer_id=0,
attention_window=4,
global_attention_size=0,
)
attention_mask = tf.zeros((batch_size, seq_length), dtype=tf.dtypes.float32)
is_index_global_attn = tf.math.greater(attention_mask, 1)
is_global_attn = tf.math.reduce_any(is_index_global_attn)
attention_mask = tf.where(tf.range(4)[None, :, None, None] > 1, -10000.0, attention_mask[:, :, None, None])
is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)
output_hidden_states = layer(
hidden_states=hidden_states, attention_mask=attention_mask,
is_index_masked=is_index_masked, is_index_global_attn=is_index_global_attn, is_global_attn=is_global_attn,
)[0]
self.assertTrue(output_hidden_states.shape, (1, 4, 8))
def test_layer_global_attn(self):
layer = longformer_attention.LongformerAttention(
num_heads=2,
key_dim=4,
value_dim=4,
layer_id=0,
attention_window=4,
global_attention_size=1,
)
hidden_states = self._get_hidden_states()
hidden_states = tf.concat([self._get_hidden_states(), self._get_hidden_states() - 0.5], axis=0)
batch_size, seq_length, hidden_size = hidden_states.shape
# create attn mask
attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32)
attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32)
attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] == 0, 10000.0, attention_mask_1)
attention_mask_1 = tf.where(tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1)
attention_mask_2 = tf.where(tf.range(4)[None, :, None, None] == 0, 10000.0, attention_mask_2)
attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0)
is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)
is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0)
is_global_attn = tf.math.reduce_any(is_index_global_attn)
output_hidden_states = layer(
hidden_states=hidden_states, attention_mask=-tf.math.abs(attention_mask),
is_index_masked=is_index_masked, is_index_global_attn=is_index_global_attn, is_global_attn=is_global_attn,
)[0]
self.assertTrue(output_hidden_states.shape, (2, 4, 8))
if __name__ == "__main__":
np.random.seed(0)
tf.random.set_seed(0)
tf.test.main()
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Longformer encoder. Modified From huggingface/transformers
"""
# pylint: disable=g-classes-have-attributes
from typing import Any, Callable, Optional, Union, List
from absl import logging
import tensorflow as tf
from official.nlp.modeling import layers
from official.projects.longformer.longformer_encoder_block import LongformerEncoderBlock
def shape_list(tensor: tf.Tensor) -> List[int]:
"""
Deal with dynamic shape in tensorflow cleanly.
Args:
tensor (:obj:`tf.Tensor`): The tensor we want the shape of.
Returns:
:obj:`List[int]`: The shape of the tensor as a list.
"""
dynamic = tf.shape(tensor)
if tensor.shape == tf.TensorShape(None):
return dynamic
static = tensor.shape.as_list()
return [dynamic[i] if s is None else s for i, s in enumerate(static)]
_Initializer = Union[str, tf.keras.initializers.Initializer]
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
# Transferred from huggingface.longformer.TFLongformerMainLayer & TFLongformerEncoder
class LongformerEncoder(tf.keras.layers.Layer):
"""Bi-directional Transformer-based encoder network.
This network implements a bi-directional Transformer-based encoder as
described in "BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
embedding lookups and transformer layers, but not the masked language model
or classification task networks.
The default values for this object are taken from the BERT-Base implementation
in "BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding".
Args:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
type_vocab_size: The number of types that the 'type_ids' input can take.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network for each transformer.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network for each transformer.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: The dropout rate to use for the attention layers within
the transformer layers.
initializer: The initialzer to use for all weights in this encoder.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
embedding_width: The width of the word embeddings. If the embedding width is
not equal to hidden size, embedding parameters will be factorized into two
matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to generate
embeddings for the input word IDs.
norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is
normalized.
"""
def __init__(
self,
vocab_size: int,
attention_window: Union[List[int], int] = 512,
global_attention_size: int = 0,
pad_token_id: int = 1,
hidden_size: int = 768,
num_layers: int = 12,
num_attention_heads: int = 12,
max_sequence_length: int = 512,
type_vocab_size: int = 16,
inner_dim: int = 3072,
inner_activation: Callable[..., Any] = _approx_gelu,
output_dropout: float = 0.1,
attention_dropout: float = 0.1,
initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
stddev=0.02),
output_range: Optional[int] = None,
embedding_width: Optional[int] = None,
embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False,
**kwargs):
# Pops kwargs that are used in V1 implementation.
if 'dict_outputs' in kwargs:
kwargs.pop('dict_outputs')
if 'return_all_encoder_outputs' in kwargs:
kwargs.pop('return_all_encoder_outputs')
if 'intermediate_size' in kwargs:
inner_dim = kwargs.pop('intermediate_size')
if 'activation' in kwargs:
inner_activation = kwargs.pop('activation')
if 'dropout_rate' in kwargs:
output_dropout = kwargs.pop('dropout_rate')
if 'attention_dropout_rate' in kwargs:
attention_dropout = kwargs.pop('attention_dropout_rate')
super().__init__(**kwargs)
# Longformer
self._attention_window = attention_window
self.global_attention_size = global_attention_size
self._pad_token_id = pad_token_id
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
if embedding_width is None:
embedding_width = hidden_size
if embedding_layer is None:
self._embedding_layer = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
else:
self._embedding_layer = embedding_layer
self._position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
self._type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
self._embedding_dropout = tf.keras.layers.Dropout(
rate=output_dropout, name='embedding_dropout')
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
self._embedding_projection = None
if embedding_width != hidden_size:
self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
self._transformer_layers = []
self._attention_mask_layer = layers.SelfAttentionMask(
name='self_attention_mask')
for i in range(num_layers):
layer = LongformerEncoderBlock(
global_attention_size=global_attention_size,
num_attention_heads=num_attention_heads,
inner_dim=inner_dim,
inner_activation=inner_activation,
# Longformer, instead of passing a list of attention_window, pass a value to sub-block
attention_window=attention_window if isinstance(attention_window, int) else attention_window[i],
layer_id=i,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
norm_first=norm_first,
output_range=output_range if i == num_layers - 1 else None,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
self._transformer_layers.append(layer)
self._pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
self._config = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
# Longformer
'attention_window': attention_window,
'pad_token_id': pad_token_id,
}
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32))
def call(self, inputs):
word_embeddings = None
if isinstance(inputs, dict):
word_ids = inputs.get('input_word_ids') # input_ids
mask = inputs.get('input_mask') # attention_mask
type_ids = inputs.get('input_type_ids') # token_type_ids
word_embeddings = inputs.get('input_word_embeddings', None) # input_embeds
else:
raise ValueError('Unexpected inputs type to %s.' % self.__class__)
(
padding_len,
word_ids,
mask,
type_ids,
word_embeddings,
) = self._pad_to_window_size(
word_ids=word_ids,
mask=mask,
type_ids=type_ids,
word_embeddings=word_embeddings,
pad_token_id=self._pad_token_id
)
if word_embeddings is None:
word_embeddings = self._embedding_layer(word_ids)
# absolute position embeddings.
position_embeddings = self._position_embedding_layer(word_embeddings)
type_embeddings = self._type_embedding_layer(type_ids)
embeddings = word_embeddings + position_embeddings + type_embeddings
embeddings = self._embedding_norm_layer(embeddings)
embeddings = self._embedding_dropout(embeddings)
if self._embedding_projection is not None:
embeddings = self._embedding_projection(embeddings)
batch_size, seq_len = shape_list(mask)
# create masks with fixed len global_attention_size
mask = tf.transpose(tf.concat(values=[tf.ones((self.global_attention_size, batch_size), tf.int32) * 2,
tf.transpose(mask)[self.global_attention_size:]], axis=0))
is_index_masked = tf.math.less(mask, 1)
is_index_global_attn = tf.transpose(tf.concat(values=[
tf.ones((self.global_attention_size, batch_size), tf.bool), tf.zeros((seq_len - self.global_attention_size, batch_size), tf.bool)
], axis=0))
is_global_attn = self.global_attention_size > 0
# Longformer
attention_mask = mask
extended_attention_mask = tf.reshape(
attention_mask, (tf.shape(mask)[0], tf.shape(mask)[1], 1, 1)
)
attention_mask = tf.cast(tf.math.abs(1 - extended_attention_mask), tf.dtypes.float32) * -10000.0
encoder_outputs = []
x = embeddings
# TFLongformerEncoder
for i, layer in enumerate(self._transformer_layers):
x = layer([
x,
attention_mask,
is_index_masked,
is_index_global_attn,
is_global_attn])
encoder_outputs.append(x)
last_encoder_output = encoder_outputs[-1]
if padding_len > 0:
last_encoder_output = last_encoder_output[:, :-padding_len]
first_token_tensor = last_encoder_output[:, 0, :]
pooled_output = self._pooler_layer(first_token_tensor)
return dict(
sequence_output=last_encoder_output,
pooled_output=pooled_output,
encoder_outputs=encoder_outputs)
def get_embedding_table(self):
return self._embedding_layer.embeddings
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return dict(self._config)
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
if 'embedding_layer' in config and config['embedding_layer'] is not None:
warn_string = (
'You are reloading a model that was saved with a '
'potentially-shared embedding layer object. If you contine to '
'train this model, the embedding layer will no longer be shared. '
'To work around this, load the model outside of the Keras API.')
print('WARNING: ' + warn_string)
logging.warn(warn_string)
return cls(**config)
def _pad_to_window_size(
self,
word_ids, # input_ids
mask, # attention_mask
type_ids, # token_type_ids
word_embeddings, # inputs_embeds
pad_token_id, # pad_token_id
):
"""A helper function to pad tokens and mask to work with implementation of Longformer selfattention."""
# padding
attention_window = (
self._attention_window if isinstance(self._attention_window, int) else max(self._attention_window)
)
assert attention_window % 2 == 0, f"`attention_window` should be an even value. Given {attention_window}"
# input_shape = shape_list(input_ids) if input_ids is not None else shape_list(inputs_embeds)
input_shape = word_ids.shape if word_ids is not None else word_embeddings.shape
batch_size, seq_len = input_shape[:2]
if seq_len is not None:
padding_len = (attention_window - seq_len % attention_window) % attention_window
else:
padding_len = 0
paddings = tf.convert_to_tensor([[0, 0], [0, padding_len]])
if word_ids is not None:
word_ids = tf.pad(word_ids, paddings, constant_values=pad_token_id)
if word_embeddings is not None:
def pad_embeddings():
word_ids_padding = tf.fill((batch_size, padding_len), self.pad_token_id)
word_embeddings_padding = self._embedding_layer(word_ids_padding)
return tf.concat([word_embeddings, word_embeddings_padding], axis=-2)
word_embeddings = tf.cond(tf.math.greater(padding_len, 0), pad_embeddings, lambda: word_embeddings)
mask = tf.pad(mask, paddings, constant_values=False) # no attention on the padding tokens
token_type_ids = tf.pad(type_ids, paddings, constant_values=0) # pad with token_type_id = 0
return (
padding_len,
word_ids,
mask,
token_type_ids,
word_embeddings,)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Longformer attention layer. Modified From huggingface/transformers
"""
import tensorflow as tf
from official.projects.longformer.longformer_attention import LongformerAttention
@tf.keras.utils.register_keras_serializable(package="Text")
class LongformerEncoderBlock(tf.keras.layers.Layer):
"""TransformerEncoderBlock layer.
This layer implements the Transformer Encoder from
"Attention Is All You Need". (https://arxiv.org/abs/1706.03762),
which combines a `tf.keras.layers.MultiHeadAttention` layer with a
two-layer feedforward network.
References:
[Attention Is All You Need](https://arxiv.org/abs/1706.03762)
[BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding](https://arxiv.org/abs/1810.04805)
"""
def __init__(self,
global_attention_size,
num_attention_heads,
inner_dim,
inner_activation,
# Longformer
attention_window,
layer_id=0,
output_range=None,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
use_bias=True,
norm_first=False,
norm_epsilon=1e-12,
output_dropout=0.0,
attention_dropout=0.0,
inner_dropout=0.0,
attention_initializer=None,
attention_axes=None,
**kwargs):
"""Initializes `TransformerEncoderBlock`.
Args:
num_attention_heads: Number of attention heads.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network.
output_range: the sequence output range, [0, output_range) for slicing the
target sequence. `None` means the target sequence is not sliced.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer kernels.
use_bias: Whether to enable use_bias in attention layer. If set False,
use_bias in attention layer is disabled.
norm_first: Whether to normalize inputs to attention and intermediate
dense layers. If set False, output of attention and intermediate dense
layers is normalized.
norm_epsilon: Epsilon value to initialize normalization layers.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: Dropout probability for within the attention layer.
inner_dropout: Dropout probability for the first Dense layer in a
two-layer feedforward network.
attention_initializer: Initializer for kernels of attention layers. If set
`None`, attention layers use kernel_initializer as initializer for
kernel.
attention_axes: axes over which the attention is applied. `None` means
attention over all axes, but batch, heads, and features.
**kwargs: keyword arguments/
"""
super().__init__(**kwargs)
self.global_attention_size = global_attention_size
self._num_heads = num_attention_heads
self._inner_dim = inner_dim
self._inner_activation = inner_activation
# Longformer
self._attention_window = attention_window
self._layer_id = layer_id
self._attention_dropout = attention_dropout
self._attention_dropout_rate = attention_dropout
self._output_dropout = output_dropout
self._output_dropout_rate = output_dropout
self._output_range = output_range
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._use_bias = use_bias
self._norm_first = norm_first
self._norm_epsilon = norm_epsilon
self._inner_dropout = inner_dropout
if attention_initializer:
self._attention_initializer = tf.keras.initializers.get(
attention_initializer)
else:
self._attention_initializer = self._kernel_initializer
self._attention_axes = attention_axes
def build(self, input_shape):
if isinstance(input_shape, tf.TensorShape):
input_tensor_shape = input_shape
elif isinstance(input_shape, (list, tuple)):
input_tensor_shape = tf.TensorShape(input_shape[0])
else:
raise ValueError(
"The type of input shape argument is not supported, got: %s" %
type(input_shape))
einsum_equation = "abc,cd->abd"
if len(input_tensor_shape.as_list()) > 3:
einsum_equation = "...bc,cd->...bd"
hidden_size = input_tensor_shape[-1]
if hidden_size % self._num_heads != 0:
raise ValueError(
"The input size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, self._num_heads))
self._attention_head_size = int(hidden_size // self._num_heads)
common_kwargs = dict(
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
# TFLongformerSelfAttention + TFLongformerSelfOutput.dense
self._attention_layer = LongformerAttention(
# Longformer
layer_id=self._layer_id,
global_attention_size=self.global_attention_size,
attention_window=self._attention_window,
num_heads=self._num_heads,
key_dim=self._attention_head_size,
dropout=self._attention_dropout,
use_bias=self._use_bias,
kernel_initializer=self._attention_initializer,
attention_axes=self._attention_axes,
name="self_attention",
**common_kwargs)
# TFLongformerSelfOutput.dropout
self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
# It is probably safe in mixed_float16, but we haven't validated this yet.
# TFLongformerSelfOutput.Layernorm
self._attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="self_attention_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32))
# TFLongformerIntermediate
# TFLongformerIntermediate.dense
self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
einsum_equation,
output_shape=(None, self._inner_dim),
bias_axes="d",
kernel_initializer=self._kernel_initializer,
name="intermediate",
**common_kwargs)
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
# TODO(b/154538392): Investigate this.
policy = tf.float32
# TFLongformerIntermediate.intermediate_act_fn
self._intermediate_activation_layer = tf.keras.layers.Activation(
self._inner_activation, dtype=policy)
# ???
self._inner_dropout_layer = tf.keras.layers.Dropout(
rate=self._inner_dropout)
# TFLongformerOutput
# TFLongformerOutput.dense
self._output_dense = tf.keras.layers.experimental.EinsumDense(
einsum_equation,
output_shape=(None, hidden_size),
bias_axes="d",
name="output",
kernel_initializer=self._kernel_initializer,
**common_kwargs)
# TFLongformerOutput.dropout
self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
# TFLongformerOutput.layernorm
self._output_layer_norm = tf.keras.layers.LayerNormalization(
name="output_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32)
super(LongformerEncoderBlock, self).build(input_shape)
def get_config(self):
config = {
"num_attention_heads":
self._num_heads,
"inner_dim":
self._inner_dim,
"inner_activation":
self._inner_activation,
"output_dropout":
self._output_dropout_rate,
"attention_dropout":
self._attention_dropout_rate,
"output_range":
self._output_range,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint),
"use_bias":
self._use_bias,
"norm_first":
self._norm_first,
"norm_epsilon":
self._norm_epsilon,
"inner_dropout":
self._inner_dropout,
"attention_initializer":
tf.keras.initializers.serialize(self._attention_initializer),
"attention_axes": self._attention_axes,
}
base_config = super(LongformerEncoderBlock, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
"""Transformer self-attention encoder block call.
Args:
inputs: a single tensor or a list of tensors.
`input tensor` as the single sequence of embeddings.
[`input tensor`, `attention mask`] to have the additional attention
mask.
[`query tensor`, `key value tensor`, `attention mask`] to have separate
input streams for the query, and key/value to the multi-head
attention.
Returns:
An output tensor with the same dimensions as input/query tensor.
"""
if isinstance(inputs, (list, tuple)):
if len(inputs) == 5:
(
input_tensor,
attention_mask,
is_index_masked,
is_index_global_attn,
is_global_attn
) = inputs
key_value = None
elif len(inputs) == 6:
assert False # No key_value
else:
raise ValueError("Unexpected inputs to %s with length at %d" %
(self.__class__, len(inputs)))
else:
input_tensor = inputs
attention_mask = None
is_index_masked = None
is_index_global_attn = None
is_global_attn = None
key_value = None
if self._output_range:
if self._norm_first:
source_tensor = input_tensor[:, 0:self._output_range, :]
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor[:, 0:self._output_range, :]
if attention_mask is not None:
attention_mask = attention_mask[:, 0:self._output_range, :]
if is_index_masked is not None:
is_index_masked = is_index_masked[:, 0:self._output_range]
if is_index_global_attn is not None:
is_index_global_attn = is_index_global_attn[:, 0:self._output_range]
else:
if self._norm_first:
source_tensor = input_tensor
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor
if key_value is None:
key_value = input_tensor
# attention_output = self._attention_layer(
# query=target_tensor, value=key_value, attention_mask=attention_mask)
attention_output = self._attention_layer(
hidden_states=target_tensor,
attention_mask=attention_mask,
is_index_masked=is_index_masked,
is_index_global_attn=is_index_global_attn,
is_global_attn=is_global_attn
)
# TFLongformerAttention.TFLongformerSelfOutput.* - {.dense}
attention_output = self._attention_dropout(attention_output)
if self._norm_first:
attention_output = source_tensor + attention_output
else:
attention_output = self._attention_layer_norm(target_tensor +
attention_output)
if self._norm_first:
source_attention_output = attention_output
attention_output = self._output_layer_norm(attention_output)
# TFLongformerIntermediate
inner_output = self._intermediate_dense(attention_output)
inner_output = self._intermediate_activation_layer(inner_output)
inner_output = self._inner_dropout_layer(inner_output)
# TFLongformerOutput
layer_output = self._output_dense(inner_output)
layer_output = self._output_dropout(layer_output)
if self._norm_first:
return source_attention_output + layer_output
# During mixed precision training, layer norm output is always fp32 for now.
# Casts fp32 for the subsequent add.
layer_output = tf.cast(layer_output, tf.float32)
return self._output_layer_norm(layer_output + attention_output)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.projects.bigbird.encoder."""
import numpy as np
import tensorflow as tf
from absl.testing import parameterized
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from tensorflow.python.distribute import combinations
from official.projects.longformer.longformer_encoder import LongformerEncoder
@keras_parameterized.run_all_keras_modes
class LongformerEncoderTest(keras_parameterized.TestCase):
@combinations.generate(combinations.combine(
attention_window=[32, 128], global_attention_size=[0, 1, 2]))
def test_encoder(self, attention_window, global_attention_size):
sequence_length = 128
batch_size = 2
vocab_size = 1024
hidden_size=256
network = LongformerEncoder(
global_attention_size=global_attention_size,
vocab_size=vocab_size,
attention_window=attention_window,
hidden_size=hidden_size,
num_layers=1,
num_attention_heads=4,
max_sequence_length=512)
word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length), dtype=np.int32)
mask_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32)
type_id_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32)
inputs = {
'input_word_ids': word_id_data,
'input_mask': mask_data,
'input_type_ids': type_id_data,
}
outputs = network(inputs)
self.assertEqual(outputs["sequence_output"].shape,
(batch_size, sequence_length, hidden_size))
@combinations.generate(combinations.combine(
norm_first=[True, False], global_attention_size=[0, 1, 2]))
def test_norm_first(self, norm_first, global_attention_size):
sequence_length = 128
batch_size = 2
vocab_size = 1024
hidden_size = 256
network = LongformerEncoder(
global_attention_size=global_attention_size,
vocab_size=vocab_size,
attention_window=32,
hidden_size=hidden_size,
num_layers=1,
num_attention_heads=4,
max_sequence_length=512,
norm_first=norm_first)
word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length), dtype=np.int32)
mask_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32)
type_id_data = np.random.randint(2, size=(batch_size, sequence_length), dtype=np.int32)
inputs = {
'input_word_ids': word_id_data,
'input_mask': mask_data,
'input_type_ids': type_id_data,
}
outputs = network(inputs)
self.assertEqual(outputs["sequence_output"].shape,
(batch_size, sequence_length, hidden_size))
if __name__ == "__main__":
tf.test.main()
\ No newline at end of file
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Longformer experiments
"""
# pylint: disable=g-doc-return-or-yield,line-too-long
import dataclasses
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.nlp.data import pretrain_dataloader
from official.nlp.tasks import masked_lm
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.configs import bert
from official.nlp.configs import encoders
import official.projects.longformer.sentence_prediction_with_load as sentence_prediction
from official.projects.longformer.longformer import LongformerEncoderConfig
AdamWeightDecay = optimization.AdamWeightDecayConfig
PolynomialLr = optimization.PolynomialLrConfig
PolynomialWarmupConfig = optimization.PolynomialWarmupConfig
@dataclasses.dataclass
class LongformerOptimizationConfig(optimization.OptimizationConfig):
optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
type="adamw",
adamw=AdamWeightDecay(
weight_decay_rate=0.01,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
epsilon=1e-6))
learning_rate: optimization.LrConfig = optimization.LrConfig(
type="polynomial",
polynomial=PolynomialLr(
initial_learning_rate=1e-4,
decay_steps=1000000,
end_learning_rate=0.0))
warmup: optimization.WarmupConfig = optimization.WarmupConfig(
type="polynomial", polynomial=PolynomialWarmupConfig(warmup_steps=10000))
@exp_factory.register_config_factory('longformer/pretraining')
def longformer_pretraining() -> cfg.ExperimentConfig:
"""BERT pretraining experiment."""
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(enable_xla=True),
task=masked_lm.MaskedLMConfig(
model=bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
type="any", any=LongformerEncoderConfig()),
cls_heads=[
bert.ClsHeadConfig(
inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence')
]
),
train_data=pretrain_dataloader.BertPretrainDataConfig(use_v2_feature_names=True),
validation_data=pretrain_dataloader.BertPretrainDataConfig(use_v2_feature_names=True,
is_training=False)),
trainer=cfg.TrainerConfig(
optimizer_config=LongformerOptimizationConfig(), train_steps=1000000),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('longformer/glue')
def longformer_glue() -> cfg.ExperimentConfig:
config = cfg.ExperimentConfig(
task=sentence_prediction.SentencePredictionConfig(
model=sentence_prediction.ModelConfig(
encoder=encoders.EncoderConfig(
type="any", any=LongformerEncoderConfig())),
train_data=sentence_prediction_dataloader
.SentencePredictionDataConfig(),
validation_data=sentence_prediction_dataloader
.SentencePredictionDataConfig(
is_training=False, drop_remainder=False)),
trainer=cfg.TrainerConfig(
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate':
0.01,
'exclude_from_weight_decay':
['LayerNorm', 'layer_norm', 'bias'],
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 3e-5,
'end_learning_rate': 0.0,
}
},
'warmup': {
'type': 'polynomial'
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Sentence prediction (classification) task."""
import dataclasses
from typing import List, Union, Optional
from absl import logging
import numpy as np
import orbit
from scipy import stats
from sklearn import metrics as sklearn_metrics
import tensorflow as tf
from official.core import base_task
from official.core import config_definitions as cfg
from official.core import task_factory
from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.configs import encoders
from official.nlp.data import data_loader_factory
from official.nlp.modeling import models
from official.nlp.tasks import utils
import pickle
METRIC_TYPES = frozenset(
['accuracy', 'matthews_corrcoef', 'pearson_spearman_corr'])
@dataclasses.dataclass
class ModelConfig(base_config.Config):
"""A classifier/regressor configuration."""
num_classes: int = 0
use_encoder_pooler: bool = False
encoder: encoders.EncoderConfig = encoders.EncoderConfig()
@dataclasses.dataclass
class SentencePredictionConfig(cfg.TaskConfig):
"""The model config."""
# At most one of `init_checkpoint` and `hub_module_url` can
# be specified.
init_checkpoint: str = ''
init_cls_pooler: bool = False
initial_parameters_from_pk: str = ''
hub_module_url: str = ''
metric_type: str = 'accuracy'
# Defines the concrete model config at instantiation time.
model: ModelConfig = ModelConfig()
train_data: cfg.DataConfig = cfg.DataConfig()
validation_data: cfg.DataConfig = cfg.DataConfig()
@task_factory.register_task_cls(SentencePredictionConfig)
class SentencePredictionTask(base_task.Task):
"""Task object for sentence_prediction."""
def __init__(self, params: cfg.TaskConfig, logging_dir=None, name=None):
super().__init__(params, logging_dir, name=name)
if params.metric_type not in METRIC_TYPES:
raise ValueError('Invalid metric_type: {}'.format(params.metric_type))
self.metric_type = params.metric_type
if hasattr(params.train_data, 'label_field'):
self.label_field = params.train_data.label_field
else:
self.label_field = 'label_ids'
def build_model(self):
if self.task_config.hub_module_url and self.task_config.init_checkpoint:
raise ValueError('At most one of `hub_module_url` and '
'`init_checkpoint` can be specified.')
if self.task_config.hub_module_url:
encoder_network = utils.get_encoder_from_hub(
self.task_config.hub_module_url)
else:
encoder_network = encoders.build_encoder(self.task_config.model.encoder)
encoder_cfg = self.task_config.model.encoder.get()
if self.task_config.model.encoder.type == 'xlnet':
return models.XLNetClassifier(
network=encoder_network,
num_classes=self.task_config.model.num_classes,
initializer=tf.keras.initializers.RandomNormal(
stddev=encoder_cfg.initializer_range))
else:
return models.BertClassifier(
network=encoder_network,
num_classes=self.task_config.model.num_classes,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
use_encoder_pooler=self.task_config.model.use_encoder_pooler)
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
label_ids = labels[self.label_field]
if self.task_config.model.num_classes == 1:
loss = tf.keras.losses.mean_squared_error(label_ids, model_outputs)
else:
loss = tf.keras.losses.sparse_categorical_crossentropy(
label_ids, tf.cast(model_outputs, tf.float32), from_logits=True)
if aux_losses:
loss += tf.add_n(aux_losses)
return tf_utils.safe_mean(loss)
def build_inputs(self, params, input_context=None):
"""Returns tf.data.Dataset for sentence_prediction task."""
if params.input_path == 'dummy':
def dummy_data(_):
dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
x = dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
if self.task_config.model.num_classes == 1:
y = tf.zeros((1,), dtype=tf.float32)
else:
y = tf.zeros((1, 1), dtype=tf.int32)
x[self.label_field] = y
return x
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(
dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return dataset
return data_loader_factory.get_data_loader(params).load(input_context)
def build_metrics(self, training=None):
del training
if self.task_config.model.num_classes == 1:
metrics = [tf.keras.metrics.MeanSquaredError()]
elif self.task_config.model.num_classes == 2:
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy'),
tf.keras.metrics.AUC(name='auc', curve='PR'),
]
else:
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy'),
]
return metrics
def process_metrics(self, metrics, labels, model_outputs):
for metric in metrics:
if metric.name == 'auc':
# Convert the logit to probability and extract the probability of True..
metric.update_state(
labels[self.label_field],
tf.expand_dims(tf.nn.softmax(model_outputs)[:, 1], axis=1))
if metric.name == 'cls_accuracy':
metric.update_state(labels[self.label_field], model_outputs)
def process_compiled_metrics(self, compiled_metrics, labels, model_outputs):
compiled_metrics.update_state(labels[self.label_field], model_outputs)
def validation_step(self, inputs, model: tf.keras.Model, metrics=None):
if self.metric_type == 'accuracy':
return super(SentencePredictionTask,
self).validation_step(inputs, model, metrics)
features, labels = inputs, inputs
outputs = self.inference_step(features, model)
loss = self.build_losses(
labels=labels, model_outputs=outputs, aux_losses=model.losses)
logs = {self.loss: loss}
if self.metric_type == 'matthews_corrcoef':
logs.update({
'sentence_prediction': # Ensure one prediction along batch dimension.
tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=1),
'labels':
labels[self.label_field],
})
if self.metric_type == 'pearson_spearman_corr':
logs.update({
'sentence_prediction': outputs,
'labels': labels[self.label_field],
})
return logs
def aggregate_logs(self, state=None, step_outputs=None):
if self.metric_type == 'accuracy':
return None
if state is None:
state = {'sentence_prediction': [], 'labels': []}
state['sentence_prediction'].append(
np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']],
axis=0))
state['labels'].append(
np.concatenate([v.numpy() for v in step_outputs['labels']], axis=0))
return state
def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
if self.metric_type == 'accuracy':
return None
elif self.metric_type == 'matthews_corrcoef':
preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
preds = np.reshape(preds, -1)
labels = np.concatenate(aggregated_logs['labels'], axis=0)
labels = np.reshape(labels, -1)
return {
self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels)
}
elif self.metric_type == 'pearson_spearman_corr':
preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
preds = np.reshape(preds, -1)
labels = np.concatenate(aggregated_logs['labels'], axis=0)
labels = np.reshape(labels, -1)
pearson_corr = stats.pearsonr(preds, labels)[0]
spearman_corr = stats.spearmanr(preds, labels)[0]
corr_metric = (pearson_corr + spearman_corr) / 2
return {self.metric_type: corr_metric}
def initialize(self, model):
"""Load a pretrained checkpoint (if exists) and then train from iter 0."""
ckpt_dir_or_file = self.task_config.init_checkpoint
if self.task_config.initial_parameters_from_pk:
num_layers = self.task_config.model.encoder.num_layers
num_attention_heads = self.task_config.model.encoder.num_attention_heads
hidden_size = self.task_config.model.encoder.hidden_size
inner_dim = self.task_config.model.encoder.inner_dim
head_size = hidden_size / num_attention_heads
assert head_size * num_attention_heads == hidden_size
encoder = model.checkpoint_items['encoder']
allenai_model = pickle.load(open(self.task_config.initial_parameters_from_pk, "rb"))
encoder._embedding_layer.set_weights(
[allenai_model["embeddings.word_embeddings.weight"]]
)
encoder._embedding_norm_layer.set_weights(
[allenai_model["embeddings.LayerNorm.weight"],
allenai_model["embeddings.LayerNorm.bias"]]
)
encoder._type_embedding_layer.set_weights(
[np.repeat(
allenai_model["embeddings.token_type_embeddings.weight"],
2,
axis=0
)]
)
encoder._position_embedding_layer.set_weights(
[allenai_model["embeddings.position_embeddings.weight"]]
)
encoder._pooler_layer.set_weights(
[allenai_model["pooler.dense.weight"],
allenai_model["pooler.dense.bias"]]
)
for layer_num in range(num_layers):
encoder._transformer_layers[layer_num]._attention_layer._global_key_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.bias"].reshape((num_attention_heads, head_size))]
)
encoder._transformer_layers[layer_num]._attention_layer._global_query_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.attention.self.query_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[f"encoder.layer.{layer_num}.attention.self.query_global.bias"].reshape((num_attention_heads, head_size))]
)
encoder._transformer_layers[layer_num]._attention_layer._global_value_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.attention.self.value_global.weight"].T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[f"encoder.layer.{layer_num}.attention.self.value_global.bias"].reshape((num_attention_heads, head_size))]
)
encoder._transformer_layers[layer_num]._attention_layer._key_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.attention.self.key.weight"].T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[f"encoder.layer.{layer_num}.attention.self.key_global.bias"].reshape((num_attention_heads, head_size))]
)
encoder._transformer_layers[layer_num]._attention_layer._query_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.attention.self.query.weight"].T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[f"encoder.layer.{layer_num}.attention.self.query.bias"].reshape((num_attention_heads, head_size))]
)
encoder._transformer_layers[layer_num]._attention_layer._value_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.attention.self.value.weight"].T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[f"encoder.layer.{layer_num}.attention.self.value.bias"].reshape((num_attention_heads, head_size))]
)
encoder._transformer_layers[layer_num]._attention_layer._output_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.attention.output.dense.weight"].T,
allenai_model[f"encoder.layer.{layer_num}.attention.output.dense.bias"]]
)
encoder._transformer_layers[layer_num]._attention_layer_norm.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.attention.output.LayerNorm.weight"],
allenai_model[f"encoder.layer.{layer_num}.attention.output.LayerNorm.bias"]]
)
encoder._transformer_layers[layer_num]._intermediate_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.weight"].T,
allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.bias"]]
)
encoder._transformer_layers[layer_num]._output_dense.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.output.dense.weight"].T,
allenai_model[f"encoder.layer.{layer_num}.output.dense.bias"]]
)
encoder._transformer_layers[layer_num]._output_layer_norm.set_weights(
[allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.weight"],
allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.bias"]]
)
if not ckpt_dir_or_file:
return
if tf.io.gfile.isdir(ckpt_dir_or_file):
ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
pretrain2finetune_mapping = {
'encoder': model.checkpoint_items['encoder'],
}
if self.task_config.init_cls_pooler:
# This option is valid when use_encoder_pooler is false.
pretrain2finetune_mapping[
'next_sentence.pooler_dense'] = model.checkpoint_items[
'sentence_prediction.pooler_dense']
ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping)
status = ckpt.read(ckpt_dir_or_file)
status.expect_partial().assert_existing_objects_matched()
logging.info('Finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
def predict(task: SentencePredictionTask,
params: cfg.DataConfig,
model: tf.keras.Model,
params_aug: Optional[cfg.DataConfig] = None,
test_time_aug_wgt: float = 0.3) -> List[Union[int, float]]:
"""Predicts on the input data.
Args:
task: A `SentencePredictionTask` object.
params: A `cfg.DataConfig` object.
model: A keras.Model.
params_aug: A `cfg.DataConfig` object for augmented data.
test_time_aug_wgt: Test time augmentation weight. The prediction score will
use (1. - test_time_aug_wgt) original prediction plus test_time_aug_wgt
augmented prediction.
Returns:
A list of predictions with length of `num_examples`. For regression task,
each element in the list is the predicted score; for classification task,
each element is the predicted class id.
"""
def predict_step(inputs):
"""Replicated prediction calculation."""
x = inputs
example_id = x.pop('example_id')
outputs = task.inference_step(x, model)
return dict(example_id=example_id, predictions=outputs)
def aggregate_fn(state, outputs):
"""Concatenates model's outputs."""
if state is None:
state = []
for per_replica_example_id, per_replica_batch_predictions in zip(
outputs['example_id'], outputs['predictions']):
state.extend(zip(per_replica_example_id, per_replica_batch_predictions))
return state
dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
task.build_inputs, params)
outputs = utils.predict(predict_step, aggregate_fn, dataset)
# When running on TPU POD, the order of output cannot be maintained,
# so we need to sort by example_id.
outputs = sorted(outputs, key=lambda x: x[0])
is_regression = task.task_config.model.num_classes == 1
if params_aug is not None:
dataset_aug = orbit.utils.make_distributed_dataset(
tf.distribute.get_strategy(), task.build_inputs, params_aug)
outputs_aug = utils.predict(predict_step, aggregate_fn, dataset_aug)
outputs_aug = sorted(outputs_aug, key=lambda x: x[0])
if is_regression:
return [(1. - test_time_aug_wgt) * x[1] + test_time_aug_wgt * y[1]
for x, y in zip(outputs, outputs_aug)]
else:
return [
tf.argmax(
(1. - test_time_aug_wgt) * x[1] + test_time_aug_wgt * y[1],
axis=-1) for x, y in zip(outputs, outputs_aug)
]
if is_regression:
return [x[1] for x in outputs]
else:
return [tf.argmax(x[1], axis=-1) for x in outputs]
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A customized training library for the specific task."""
from absl import app
from absl import flags
import gin
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
from official.projects.longformer import longformer_experiments
FLAGS = flags.FLAGS
def main(_):
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
params = train_utils.parse_configuration(FLAGS)
model_dir = FLAGS.model_dir
if 'train' in FLAGS.mode:
# Pure eval modes do not output yaml files. Otherwise continuous eval job
# may race against the train job for writing the same file.
train_utils.serialize_config(params, model_dir)
# Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
# can have significant impact on model speeds by utilizing float16 in case of
# GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
# dtype is float16
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
distribution_strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu,
**params.runtime.model_parallelism())
with distribution_strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
train_lib.run_experiment(
distribution_strategy=distribution_strategy,
task=task,
mode=FLAGS.mode,
params=params,
model_dir=model_dir)
train_utils.save_gin_config(FLAGS.mode, model_dir)
if __name__ == '__main__':
tfm_flags.define_flags()
app.run(main)
import transformers
pretrained_lm = "allenai/longformer-base-4096"
model = transformers.AutoModel.from_pretrained(pretrained_lm)
import pickle
pickle.dump({
n: p.data.numpy()
for n, p in model.named_parameters()}, open(f"{pretrained_lm.replace('/', '_')}.pk", "wb"))
\ No newline at end of file
import os
import tensorflow as tf
import transformers
import datasets
from convert_to_tf_record import file_based_convert_examples_to_features
pretrained_lm = "allenai/longformer-base-4096"
task_name = "mnli"
save_path = "./"
raw_datasets = datasets.load_dataset("glue", task_name, cache_dir=None)
label_list = raw_datasets["train"].features["label"].names
num_labels = len(label_list)
tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained_lm,
use_fast=True,
)
task_to_keys = {
"cola": ("sentence", None),
"mnli": ("premise", "hypothesis"),
"mrpc": ("sentence1", "sentence2"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"rte": ("sentence1", "sentence2"),
"sst2": ("sentence", None),
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
sentence1_key, sentence2_key = task_to_keys[task_name]
padding = "max_length"
# make sure this is the same with model input size.
max_seq_length = 512
def preprocess_function(examples):
# Tokenize the texts
args = (
(examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
)
result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)
return result
raw_datasets = raw_datasets.map(
preprocess_function,
batched=True,
desc="Running tokenizer on dataset",
)
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["validation_matched" if task_name == "mnli" else "validation"]
print("train_dataset", train_dataset[0])
print("eval_dataset", eval_dataset[0])
def file_based_convert_examples_to_features(examples,
output_file):
"""Convert a set of `InputExample`s to a TFRecord file."""
tf.io.gfile.makedirs(os.path.dirname(output_file))
writer = tf.io.TFRecordWriter(output_file)
for ex_index, example in enumerate(examples):
if ex_index % 10000 == 0:
print(f"Writing example {ex_index} of {len(examples)}")
def create_int_feature(values):
f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return f
def create_float_feature(values):
f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
return f
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(example["input_ids"])
features["input_mask"] = create_int_feature(example["attention_mask"])
features["segment_ids"] = create_int_feature([0] * len(example["attention_mask"]))
features["label_ids"] = create_int_feature([example["label"]])
features["is_real_example"] = create_int_feature([1])
features["example_id"] = create_int_feature([example["idx"]])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()
file_based_convert_examples_to_features(train_dataset, os.path.join(save_path, f"{pretrained_lm.replace('/', '_')}_train.tf_record"))
file_based_convert_examples_to_features(eval_dataset, os.path.join(save_path, f"{pretrained_lm.replace('/', '_')}_eval.tf_record"))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment