"configs/eval_codeagent.py" did not exist on "ac3a2c4501eae2c9ef98fe88244944b70a322eaa"
Unverified commit 8b641b13, authored by Srihari Humbarwadi and committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab

parents 7cffacfe 357fa547
task:
  hub_module_url: ''
  model:
    num_classes: 3
    encoder:
      type: any
      any:
        max_position_embeddings: 512
        attention_window: [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32]
        global_attention_size: 1
  metric_type: 'accuracy'
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: TODO
    is_training: true
    seq_length: 128
  validation_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: TODO
    is_training: false
    seq_length: 128
trainer:
  checkpoint_interval: 1000
  continuous_eval_timeout: 7200
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 61359
        end_learning_rate: 0.0
        initial_learning_rate: 3.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        warmup_steps: 6136
      type: polynomial
  steps_per_loop: 100
  summary_interval: 100
  # Training data size 392,702 examples, 5 epochs.
  train_steps: 61359
  validation_interval: 2000
  validation_steps: 307
task:
  hub_module_url: ''
  model:
    num_classes: 3
    encoder:
      type: any
      any:
        max_position_embeddings: 4098
        attention_window: [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128]
        global_attention_size: 1
        vocab_size: 50265
  metric_type: 'accuracy'
  train_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: TODO
    is_training: true
    seq_length: 512
  validation_data:
    drop_remainder: true
    global_batch_size: 32
    input_path: TODO
    is_training: false
    seq_length: 512
trainer:
  checkpoint_interval: 1000
  continuous_eval_timeout: 7200
  optimizer_config:
    learning_rate:
      polynomial:
        decay_steps: 61359
        end_learning_rate: 0.0
        initial_learning_rate: 3.0e-05
        power: 1.0
      type: polynomial
    optimizer:
      type: adamw
    warmup:
      polynomial:
        power: 1
        warmup_steps: 6136
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  # Training data size 392,702 examples, 5 epochs.
  train_steps: 61359
  validation_interval: 2000
  validation_steps: 307
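The two YAML files above are overrides for the `longformer/glue` experiment registered in `longformer_experiments.py` below. A minimal sketch of applying such an override outside the training driver, assuming the usual Model Garden `exp_factory`/`Config.override` behavior; the YAML path is a placeholder:

```python
import yaml

from official.core import exp_factory
# Importing the experiments module registers 'longformer/glue'.
from official.projects.longformer import longformer_experiments  # pylint: disable=unused-import

config = exp_factory.get_exp_config('longformer/glue')
# 'experiments/glue_mnli_matched.yaml' is a placeholder for a file with the
# contents shown above.
with open('experiments/glue_mnli_matched.yaml') as f:
  config.override(yaml.safe_load(f), is_strict=False)
print(config.task.train_data.seq_length)  # 512 for the second config above.
```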
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Longformer model configurations and instantiation methods."""
import dataclasses
from typing import List
import tensorflow as tf
from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.configs import encoders
from official.projects.longformer.longformer_encoder import LongformerEncoder
@dataclasses.dataclass
class LongformerEncoderConfig(encoders.BertEncoderConfig):
"""Extra paramerters for Longformer configs.
Attributes:
attention_window: list of ints representing the window size for each layer.
global_attention_size: the size of global attention used for each token.
pad_token_id: the token id for the pad token
"""
attention_window: List[int] = dataclasses.field(default_factory=list)
global_attention_size: int = 0
pad_token_id: int = 1
@base_config.bind(LongformerEncoderConfig)
def get_encoder(encoder_cfg: LongformerEncoderConfig):
"""Gets a 'LongformerEncoder' object.
Args:
encoder_cfg: A 'LongformerEncoderConfig'.
Returns:
    An encoder object.
"""
encoder = LongformerEncoder(
attention_window=encoder_cfg.attention_window,
global_attention_size=encoder_cfg.global_attention_size,
vocab_size=encoder_cfg.vocab_size,
hidden_size=encoder_cfg.hidden_size,
num_layers=encoder_cfg.num_layers,
num_attention_heads=encoder_cfg.num_attention_heads,
inner_dim=encoder_cfg.intermediate_size,
inner_activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
output_dropout=encoder_cfg.dropout_rate,
attention_dropout=encoder_cfg.attention_dropout_rate,
max_sequence_length=encoder_cfg.max_position_embeddings,
type_vocab_size=encoder_cfg.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
output_range=encoder_cfg.output_range,
embedding_width=encoder_cfg.embedding_size,
norm_first=encoder_cfg.norm_first)
return encoder
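A minimal usage sketch for this module; the hyperparameter values below are illustrative only and do not correspond to a released checkpoint:

```python
from official.projects.longformer.longformer import LongformerEncoderConfig, get_encoder

encoder_cfg = LongformerEncoderConfig(
    vocab_size=50265,
    hidden_size=768,
    num_layers=2,
    num_attention_heads=12,
    max_position_embeddings=512,
    attention_window=[32, 32],  # one window size per layer
    global_attention_size=1)
encoder = get_encoder(encoder_cfg)
print(type(encoder).__name__)  # LongformerEncoder
```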
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.projects.longformer.longformer_attention."""
import numpy as np
import tensorflow as tf
from official.modeling.tf_utils import get_shape_list
from official.projects.longformer import longformer_attention
def _create_mock_attention_data(num_heads,
key_dim,
value_dim,
q_seq_length,
kv_seq_length,
batch_size,
include_mask=False):
"""Creates mock testing data.
Args:
num_heads: `int`, Number of attention heads.
key_dim: `int`, Size of query head.
value_dim: `int`, Size of key, value dim.
q_seq_length: `int`, query sequence length of the input.
kv_seq_length: `int`, key, value sequence length of the input.
batch_size: `int`, the batch size.
include_mask: optional `bool`, whether or not to include mask data.
Returns:
A dictionary with `str` as keys and `Tensor` as values.
"""
query_shape = (batch_size, q_seq_length, key_dim)
value_shape = (batch_size, kv_seq_length, value_dim)
data = dict(
query=tf.random.normal(shape=query_shape),
value=tf.random.normal(shape=value_shape),
key=tf.random.normal(shape=value_shape))
total_seq_length = kv_seq_length
if include_mask:
mask_shape = (batch_size, num_heads, q_seq_length, total_seq_length)
mask_data = np.random.randint(2, size=mask_shape).astype('float32')
mask_data = dict(attention_mask=mask_data)
data.update(mask_data)
return data
class LongformerAttentionTest(tf.test.TestCase):
def setUp(self):
super(LongformerAttentionTest, self).setUp()
np.random.seed(0)
tf.random.set_seed(0)
def _get_hidden_states(self):
return tf.convert_to_tensor(
[[
[
4.98332758e-01,
2.69175139e00,
-7.08081422e-03,
1.04915401e00,
-1.83476661e00,
7.67220476e-01,
2.98580543e-01,
2.84803992e-02,
],
[
-7.58357372e-01,
4.20635998e-01,
-4.04739919e-02,
1.59924145e-01,
2.05135748e00,
-1.15997978e00,
5.37166397e-01,
2.62873606e-01,
],
[
-1.69438001e00,
4.17574660e-01,
-1.49196962e00,
-1.76483717e00,
-1.94566312e-01,
-1.71183858e00,
7.72903565e-01,
-1.11557056e00,
],
[
5.44028163e-01,
2.05466114e-01,
-3.63045868e-01,
2.41865062e-01,
3.20348382e-01,
-9.05611176e-01,
-1.92690727e-01,
-1.19917547e00,
],
]],
dtype=tf.float32,
)
def test_diagonalize(self):
hidden_states = self._get_hidden_states()
hidden_states = tf.reshape(hidden_states,
(1, 8, 4)) # set seq length = 8, hidden dim = 4
chunked_hidden_states = longformer_attention.LongformerAttention._chunk(
hidden_states, window_overlap=2)
window_overlap_size = get_shape_list(chunked_hidden_states)[2]
self.assertEqual(window_overlap_size, 4)
padded_hidden_states = longformer_attention.LongformerAttention._pad_and_diagonalize(
chunked_hidden_states)
self.assertEqual(
get_shape_list(padded_hidden_states)[-1],
get_shape_list(chunked_hidden_states)[-1] + window_overlap_size - 1)
# first row => [0.4983, 2.6918, -0.0071, 1.0492, 0.0000, 0.0000, 0.0000]
tf.debugging.assert_near(
padded_hidden_states[0, 0, 0, :4],
chunked_hidden_states[0, 0, 0],
rtol=1e-3)
tf.debugging.assert_near(
padded_hidden_states[0, 0, 0, 4:],
tf.zeros((3,), dtype=tf.dtypes.float32),
rtol=1e-3)
# last row => [0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629]
tf.debugging.assert_near(
padded_hidden_states[0, 0, -1, 3:],
chunked_hidden_states[0, 0, -1],
rtol=1e-3)
tf.debugging.assert_near(
padded_hidden_states[0, 0, -1, :3],
tf.zeros((3,), dtype=tf.dtypes.float32),
rtol=1e-3)
def test_pad_and_transpose_last_two_dims(self):
hidden_states = self._get_hidden_states()
    self.assertEqual(get_shape_list(hidden_states), [1, 4, 8])
# pad along seq length dim
paddings = tf.constant([[0, 0], [0, 0], [0, 1], [0, 0]],
dtype=tf.dtypes.int32)
hidden_states = longformer_attention.LongformerAttention._chunk(
hidden_states, window_overlap=2)
padded_hidden_states = longformer_attention.LongformerAttention._pad_and_transpose_last_two_dims(
hidden_states, paddings)
self.assertEqual(get_shape_list(padded_hidden_states), [1, 1, 8, 5])
expected_added_dim = tf.zeros((5,), dtype=tf.dtypes.float32)
tf.debugging.assert_near(
expected_added_dim, padded_hidden_states[0, 0, -1, :], rtol=1e-6)
tf.debugging.assert_near(
hidden_states[0, 0, -1, :],
tf.reshape(padded_hidden_states, (1, -1))[0, 24:32],
rtol=1e-6)
def test_mask_invalid_locations(self):
hidden_states = self._get_hidden_states()
batch_size = 1
seq_length = 8
hidden_size = 4
hidden_states = tf.reshape(hidden_states,
(batch_size, seq_length, hidden_size))
hidden_states = longformer_attention.LongformerAttention._chunk(
hidden_states, window_overlap=2)
hid_states_1 = longformer_attention.LongformerAttention._mask_invalid_locations(
hidden_states, 1)
hid_states_2 = longformer_attention.LongformerAttention._mask_invalid_locations(
hidden_states, 2)
hid_states_3 = longformer_attention.LongformerAttention._mask_invalid_locations(
hidden_states[:, :, :, :3], 2)
hid_states_4 = longformer_attention.LongformerAttention._mask_invalid_locations(
hidden_states[:, :, 2:, :], 2)
self.assertEqual(
tf.math.reduce_sum(
tf.cast(tf.math.is_inf(hid_states_1), tf.dtypes.int32)), 8)
self.assertEqual(
tf.math.reduce_sum(
tf.cast(tf.math.is_inf(hid_states_2), tf.dtypes.int32)), 24)
self.assertEqual(
tf.math.reduce_sum(
tf.cast(tf.math.is_inf(hid_states_3), tf.dtypes.int32)), 24)
self.assertEqual(
tf.math.reduce_sum(
tf.cast(tf.math.is_inf(hid_states_4), tf.dtypes.int32)), 12)
def test_chunk(self):
hidden_states = self._get_hidden_states()
batch_size = 1
seq_length = 8
hidden_size = 4
hidden_states = tf.reshape(hidden_states,
(batch_size, seq_length, hidden_size))
chunked_hidden_states = longformer_attention.LongformerAttention._chunk(
hidden_states, window_overlap=2)
# expected slices across chunk and seq length dim
expected_slice_along_seq_length = tf.convert_to_tensor(
[0.4983, -0.7584, -1.6944], dtype=tf.dtypes.float32)
expected_slice_along_chunk = tf.convert_to_tensor(
[0.4983, -1.8348, -0.7584, 2.0514], dtype=tf.dtypes.float32)
self.assertEqual(get_shape_list(chunked_hidden_states), [1, 3, 4, 4])
tf.debugging.assert_near(
chunked_hidden_states[0, :, 0, 0],
expected_slice_along_seq_length,
rtol=1e-3)
tf.debugging.assert_near(
chunked_hidden_states[0, 0, :, 0],
expected_slice_along_chunk,
rtol=1e-3)
def test_layer_local_attn(self):
hidden_states = self._get_hidden_states()
batch_size, seq_length, _ = hidden_states.shape
layer = longformer_attention.LongformerAttention(
num_heads=2,
key_dim=4,
value_dim=4,
layer_id=0,
attention_window=4,
global_attention_size=0,
)
attention_mask = tf.zeros((batch_size, seq_length), dtype=tf.dtypes.float32)
is_index_global_attn = tf.math.greater(attention_mask, 1)
attention_mask = tf.where(
tf.range(4)[None, :, None, None] > 1, -10000.0,
attention_mask[:, :, None, None])
is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)
output_hidden_states = layer(
hidden_states=hidden_states,
attention_mask=attention_mask,
is_index_masked=is_index_masked,
is_index_global_attn=is_index_global_attn,
)[0]
self.assertTrue(output_hidden_states.shape, (1, 4, 8))
def test_layer_global_attn(self):
layer = longformer_attention.LongformerAttention(
num_heads=2,
key_dim=4,
value_dim=4,
layer_id=0,
attention_window=4,
global_attention_size=1,
)
hidden_states = self._get_hidden_states()
hidden_states = tf.concat(
[self._get_hidden_states(),
self._get_hidden_states() - 0.5], axis=0)
_, seq_length, _ = hidden_states.shape
# create attn mask
attention_mask_1 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32)
attention_mask_2 = tf.zeros((1, 1, 1, seq_length), dtype=tf.dtypes.float32)
attention_mask_1 = tf.where(
tf.range(4)[None, :, None, None] == 0, 10000.0, attention_mask_1)
attention_mask_1 = tf.where(
tf.range(4)[None, :, None, None] > 2, -10000.0, attention_mask_1)
attention_mask_2 = tf.where(
tf.range(4)[None, :, None, None] == 0, 10000.0, attention_mask_2)
attention_mask = tf.concat([attention_mask_1, attention_mask_2], axis=0)
is_index_masked = tf.math.less(attention_mask[:, :, 0, 0], 0)
is_index_global_attn = tf.math.greater(attention_mask[:, :, 0, 0], 0)
output_hidden_states = layer(
hidden_states=hidden_states,
attention_mask=-tf.math.abs(attention_mask),
is_index_masked=is_index_masked,
is_index_global_attn=is_index_global_attn,
)[0]
self.assertTrue(output_hidden_states.shape, (2, 4, 8))
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Longformer encoder. Modified From huggingface/transformers."""
# pylint: disable=g-classes-have-attributes
from typing import Any, Callable, List, Optional, Union
from absl import logging
import tensorflow as tf
from official.modeling.tf_utils import get_shape_list
from official.nlp.modeling import layers
from official.projects.longformer.longformer_encoder_block import LongformerEncoderBlock
_Initializer = Union[str, tf.keras.initializers.Initializer]
_approx_gelu = lambda x: tf.keras.activations.gelu(x, approximate=True)
class LongformerEncoder(tf.keras.layers.Layer):
"""LongformerEncoder.
Args:
vocab_size: The size of the token vocabulary.
attention_window: list of ints representing the window size for each layer.
global_attention_size: the size of global attention used for each token.
pad_token_id: the token id for the pad token
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
type_vocab_size: The number of types that the 'type_ids' input can take.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network for each transformer.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network for each transformer.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: The dropout rate to use for the attention layers within
the transformer layers.
    initializer: The initializer to use for all weights in this encoder.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
embedding_width: The width of the word embeddings. If the embedding width is
not equal to hidden size, embedding parameters will be factorized into two
matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
embedding_layer: An optional Layer instance which will be called to generate
embeddings for the input word IDs.
norm_first: Whether to normalize inputs to attention and intermediate dense
layers. If set False, output of attention and intermediate dense layers is
normalized.
"""
def __init__(
self,
vocab_size: int,
attention_window: Union[List[int], int] = 512,
global_attention_size: int = 0,
pad_token_id: int = 1,
hidden_size: int = 768,
num_layers: int = 12,
num_attention_heads: int = 12,
max_sequence_length: int = 512,
type_vocab_size: int = 16,
inner_dim: int = 3072,
inner_activation: Callable[..., Any] = _approx_gelu,
output_dropout: float = 0.1,
attention_dropout: float = 0.1,
initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
stddev=0.02),
output_range: Optional[int] = None,
embedding_width: Optional[int] = None,
embedding_layer: Optional[tf.keras.layers.Layer] = None,
norm_first: bool = False,
**kwargs):
super().__init__(**kwargs)
# Longformer args
self._attention_window = attention_window
self._global_attention_size = global_attention_size
self._pad_token_id = pad_token_id
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
if embedding_width is None:
embedding_width = hidden_size
if embedding_layer is None:
self._embedding_layer = layers.OnDeviceEmbedding(
vocab_size=vocab_size,
embedding_width=embedding_width,
initializer=initializer,
name='word_embeddings')
else:
self._embedding_layer = embedding_layer
self._position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
self._type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
self._embedding_dropout = tf.keras.layers.Dropout(
rate=output_dropout, name='embedding_dropout')
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
self._embedding_projection = None
if embedding_width != hidden_size:
self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
self._transformer_layers = []
self._attention_mask_layer = layers.SelfAttentionMask(
name='self_attention_mask')
for i in range(num_layers):
layer = LongformerEncoderBlock(
global_attention_size=global_attention_size,
num_attention_heads=num_attention_heads,
inner_dim=inner_dim,
inner_activation=inner_activation,
attention_window=attention_window[i],
layer_id=i,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
norm_first=norm_first,
output_range=output_range if i == num_layers - 1 else None,
kernel_initializer=initializer,
name=f'transformer/layer_{i}')
self._transformer_layers.append(layer)
self._pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
self._config = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
'embedding_layer': embedding_layer,
'norm_first': norm_first,
'attention_window': attention_window,
'global_attention_size': global_attention_size,
'pad_token_id': pad_token_id,
}
self.inputs = dict(
input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32))
def call(self, inputs):
word_embeddings = None
if isinstance(inputs, dict):
word_ids = inputs.get('input_word_ids') # input_ids
mask = inputs.get('input_mask') # attention_mask
type_ids = inputs.get('input_type_ids') # token_type_ids
word_embeddings = inputs.get('input_word_embeddings',
None) # input_embeds
else:
raise ValueError(f'Unexpected inputs type to {self.__class__}.')
(
padding_len,
word_ids,
mask,
type_ids,
word_embeddings,
) = self._pad_to_window_size(
word_ids=word_ids,
mask=mask,
type_ids=type_ids,
word_embeddings=word_embeddings,
pad_token_id=self._pad_token_id)
if word_embeddings is None:
word_embeddings = self._embedding_layer(word_ids)
# absolute position embeddings.
position_embeddings = self._position_embedding_layer(word_embeddings)
type_embeddings = self._type_embedding_layer(type_ids)
embeddings = word_embeddings + position_embeddings + type_embeddings
embeddings = self._embedding_norm_layer(embeddings)
embeddings = self._embedding_dropout(embeddings)
if self._embedding_projection is not None:
embeddings = self._embedding_projection(embeddings)
batch_size, seq_len = get_shape_list(mask)
# create masks with fixed len global_attention_size
mask = tf.transpose(
tf.concat(
values=[
tf.ones(
(self._global_attention_size, batch_size), tf.int32) * 2,
tf.transpose(mask)[self._global_attention_size:]
],
axis=0))
is_index_masked = tf.math.less(mask, 1)
is_index_global_attn = tf.transpose(
tf.concat(
values=[
tf.ones((self._global_attention_size, batch_size), tf.bool),
tf.zeros((seq_len - self._global_attention_size, batch_size),
tf.bool)
],
axis=0))
# Longformer
attention_mask = mask
extended_attention_mask = tf.reshape(
attention_mask, (tf.shape(mask)[0], tf.shape(mask)[1], 1, 1))
attention_mask = tf.cast(
tf.math.abs(1 - extended_attention_mask), tf.dtypes.float32) * -10000.0
encoder_outputs = []
x = embeddings
# TFLongformerEncoder
for layer in self._transformer_layers:
x = layer([x, attention_mask, is_index_masked, is_index_global_attn])
encoder_outputs.append(x)
last_encoder_output = encoder_outputs[-1]
if padding_len > 0:
last_encoder_output = last_encoder_output[:, :-padding_len]
first_token_tensor = last_encoder_output[:, 0, :]
pooled_output = self._pooler_layer(first_token_tensor)
return dict(
sequence_output=last_encoder_output,
pooled_output=pooled_output,
encoder_outputs=encoder_outputs)
def get_embedding_table(self):
return self._embedding_layer.embeddings
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return dict(self._config)
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
if 'embedding_layer' in config and config['embedding_layer'] is not None:
warn_string = (
'You are reloading a model that was saved with a '
          'potentially-shared embedding layer object. If you continue to '
'train this model, the embedding layer will no longer be shared. '
'To work around this, load the model outside of the Keras API.')
print('WARNING: ' + warn_string)
logging.warn(warn_string)
return cls(**config)
def _pad_to_window_size(
self,
word_ids,
mask,
type_ids,
word_embeddings,
pad_token_id,
):
# padding
attention_window = max(self._attention_window)
assert (attention_window %
2 == 0), ('`attention_window` should be an even value.'
f'Given {attention_window}')
input_shape = get_shape_list(
word_ids) if word_ids is not None else get_shape_list(word_embeddings)
batch_size, seq_len = input_shape[:2]
if seq_len is not None:
padding_len = (attention_window -
seq_len % attention_window) % attention_window
else:
padding_len = 0
paddings = tf.convert_to_tensor([[0, 0], [0, padding_len]])
if word_ids is not None:
word_ids = tf.pad(word_ids, paddings, constant_values=pad_token_id)
if word_embeddings is not None:
def pad_embeddings():
        word_ids_padding = tf.fill((batch_size, padding_len), self._pad_token_id)
word_embeddings_padding = self._embedding_layer(word_ids_padding)
return tf.concat([word_embeddings, word_embeddings_padding], axis=-2)
word_embeddings = tf.cond(
tf.math.greater(padding_len, 0), pad_embeddings,
lambda: word_embeddings)
mask = tf.pad(
mask, paddings,
constant_values=False) # no attention on the padding tokens
token_type_ids = tf.pad(
type_ids, paddings, constant_values=0) # pad with token_type_id = 0
return (
padding_len,
word_ids,
mask,
token_type_ids,
word_embeddings,
)
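A worked example of the padding rule in `_pad_to_window_size` above: inputs are right-padded so the sequence length becomes a multiple of the largest per-layer attention window, and `call` strips the padding from `sequence_output` again before returning:

```python
attention_window = 128  # max of the per-layer windows
for seq_len in (100, 128, 500, 512):
  padding_len = (attention_window - seq_len % attention_window) % attention_window
  print(f'{seq_len} -> padded length {seq_len + padding_len}')
# 100 -> padded length 128
# 128 -> padded length 128
# 500 -> padded length 512
# 512 -> padded length 512
```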
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Longformer attention layer. Modified From huggingface/transformers."""
import tensorflow as tf
from official.projects.longformer.longformer_attention import LongformerAttention
@tf.keras.utils.register_keras_serializable(package="Text")
class LongformerEncoderBlock(tf.keras.layers.Layer):
"""LongformerEncoderBlock.
Args:
num_attention_heads: Number of attention heads.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network.
output_range: the sequence output range, [0, output_range) for slicing the
target sequence. `None` means the target sequence is not sliced.
kernel_initializer: Initializer for dense layer kernels.
bias_initializer: Initializer for dense layer biases.
kernel_regularizer: Regularizer for dense layer kernels.
bias_regularizer: Regularizer for dense layer biases.
activity_regularizer: Regularizer for dense layer activity.
kernel_constraint: Constraint for dense layer kernels.
bias_constraint: Constraint for dense layer kernels.
use_bias: Whether to enable use_bias in attention layer. If set False,
use_bias in attention layer is disabled.
norm_first: Whether to normalize inputs to attention and intermediate
dense layers. If set False, output of attention and intermediate dense
layers is normalized.
norm_epsilon: Epsilon value to initialize normalization layers.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: Dropout probability for within the attention layer.
inner_dropout: Dropout probability for the first Dense layer in a
two-layer feedforward network.
attention_initializer: Initializer for kernels of attention layers. If set
`None`, attention layers use kernel_initializer as initializer for
kernel.
attention_axes: axes over which the attention is applied. `None` means
attention over all axes, but batch, heads, and features.
    **kwargs: keyword arguments.
"""
def __init__(
self,
global_attention_size,
num_attention_heads,
inner_dim,
inner_activation,
# Longformer
attention_window,
layer_id=0,
output_range=None,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None,
kernel_constraint=None,
bias_constraint=None,
use_bias=True,
norm_first=False,
norm_epsilon=1e-12,
output_dropout=0.0,
attention_dropout=0.0,
inner_dropout=0.0,
attention_initializer=None,
attention_axes=None,
**kwargs):
super().__init__(**kwargs)
self.global_attention_size = global_attention_size
self._num_heads = num_attention_heads
self._inner_dim = inner_dim
self._inner_activation = inner_activation
# Longformer
self._attention_window = attention_window
self._layer_id = layer_id
self._attention_dropout = attention_dropout
self._attention_dropout_rate = attention_dropout
self._output_dropout = output_dropout
self._output_dropout_rate = output_dropout
self._output_range = output_range
self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self._bias_initializer = tf.keras.initializers.get(bias_initializer)
self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self._bias_constraint = tf.keras.constraints.get(bias_constraint)
self._use_bias = use_bias
self._norm_first = norm_first
self._norm_epsilon = norm_epsilon
self._inner_dropout = inner_dropout
if attention_initializer:
self._attention_initializer = tf.keras.initializers.get(
attention_initializer)
else:
self._attention_initializer = self._kernel_initializer
self._attention_axes = attention_axes
def build(self, input_shape):
if isinstance(input_shape, tf.TensorShape):
input_tensor_shape = input_shape
elif isinstance(input_shape, (list, tuple)):
input_tensor_shape = tf.TensorShape(input_shape[0])
else:
raise ValueError(
f"The type of input shape argument is not supported, got: "
f"{type(input_shape)}")
einsum_equation = "abc,cd->abd"
if len(input_tensor_shape.as_list()) > 3:
einsum_equation = "...bc,cd->...bd"
hidden_size = input_tensor_shape[-1]
if hidden_size % self._num_heads != 0:
raise ValueError(
f"The input size ({hidden_size}) is not a multiple of the number of attention "
f"heads ({self._num_heads})")
self._attention_head_size = int(hidden_size // self._num_heads)
common_kwargs = dict(
bias_initializer=self._bias_initializer,
kernel_regularizer=self._kernel_regularizer,
bias_regularizer=self._bias_regularizer,
activity_regularizer=self._activity_regularizer,
kernel_constraint=self._kernel_constraint,
bias_constraint=self._bias_constraint)
# TFLongformerSelfAttention + TFLongformerSelfOutput.dense
self._attention_layer = LongformerAttention(
# Longformer
layer_id=self._layer_id,
global_attention_size=self.global_attention_size,
attention_window=self._attention_window,
num_heads=self._num_heads,
key_dim=self._attention_head_size,
dropout=self._attention_dropout,
use_bias=self._use_bias,
kernel_initializer=self._attention_initializer,
attention_axes=self._attention_axes,
name="self_attention",
**common_kwargs)
# TFLongformerSelfOutput.dropout
self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
# It is probably safe in mixed_float16, but we haven't validated this yet.
# TFLongformerSelfOutput.Layernorm
self._attention_layer_norm = (
tf.keras.layers.LayerNormalization(
name="self_attention_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32))
# TFLongformerIntermediate
# TFLongformerIntermediate.dense
self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
einsum_equation,
output_shape=(None, self._inner_dim),
bias_axes="d",
kernel_initializer=self._kernel_initializer,
name="intermediate",
**common_kwargs)
policy = tf.keras.mixed_precision.global_policy()
if policy.name == "mixed_bfloat16":
# bfloat16 causes BERT with the LAMB optimizer to not converge
# as well, so we use float32.
# TODO(b/154538392): Investigate this.
policy = tf.float32
# TFLongformerIntermediate.intermediate_act_fn
self._intermediate_activation_layer = tf.keras.layers.Activation(
self._inner_activation, dtype=policy)
self._inner_dropout_layer = tf.keras.layers.Dropout(
rate=self._inner_dropout)
# TFLongformerOutput
# TFLongformerOutput.dense
self._output_dense = tf.keras.layers.experimental.EinsumDense(
einsum_equation,
output_shape=(None, hidden_size),
bias_axes="d",
name="output",
kernel_initializer=self._kernel_initializer,
**common_kwargs)
# TFLongformerOutput.dropout
self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
# Use float32 in layernorm for numeric stability.
# TFLongformerOutput.layernorm
self._output_layer_norm = tf.keras.layers.LayerNormalization(
name="output_layer_norm",
axis=-1,
epsilon=self._norm_epsilon,
dtype=tf.float32)
super().build(input_shape)
def get_config(self):
config = {
"num_attention_heads":
self._num_heads,
"inner_dim":
self._inner_dim,
"inner_activation":
self._inner_activation,
"output_dropout":
self._output_dropout_rate,
"attention_dropout":
self._attention_dropout_rate,
"output_range":
self._output_range,
"kernel_initializer":
tf.keras.initializers.serialize(self._kernel_initializer),
"bias_initializer":
tf.keras.initializers.serialize(self._bias_initializer),
"kernel_regularizer":
tf.keras.regularizers.serialize(self._kernel_regularizer),
"bias_regularizer":
tf.keras.regularizers.serialize(self._bias_regularizer),
"activity_regularizer":
tf.keras.regularizers.serialize(self._activity_regularizer),
"kernel_constraint":
tf.keras.constraints.serialize(self._kernel_constraint),
"bias_constraint":
tf.keras.constraints.serialize(self._bias_constraint),
"use_bias":
self._use_bias,
"norm_first":
self._norm_first,
"norm_epsilon":
self._norm_epsilon,
"inner_dropout":
self._inner_dropout,
"attention_initializer":
tf.keras.initializers.serialize(self._attention_initializer),
"attention_axes":
self._attention_axes,
}
base_config = super().get_config()
return dict(list(base_config.items()) + list(config.items()))
def call(self, inputs):
"""Transformer self-attention encoder block call.
Args:
inputs: a single tensor or a list of tensors. `input tensor` as the single
sequence of embeddings. [`input tensor`, `attention mask`] to have the
additional attention mask. [`query tensor`, `key value tensor`,
`attention mask`] to have separate input streams for the query, and
key/value to the multi-head attention.
Returns:
An output tensor with the same dimensions as input/query tensor.
"""
if isinstance(inputs, (list, tuple)):
if len(inputs) == 4:
(
input_tensor,
attention_mask,
is_index_masked,
is_index_global_attn,
) = inputs
key_value = None
elif len(inputs) == 5:
        raise ValueError(
            f'Unexpected inputs to {self.__class__}: a separate key/value '
            'tensor is not supported.')
else:
raise ValueError(
f"Unexpected inputs to {self.__class__} with length at {len(inputs)}"
)
else:
input_tensor = inputs
attention_mask = None
is_index_masked = None
is_index_global_attn = None
key_value = None
if self._output_range:
if self._norm_first:
source_tensor = input_tensor[:, 0:self._output_range, :]
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor[:, 0:self._output_range, :]
if attention_mask is not None:
attention_mask = attention_mask[:, 0:self._output_range, :]
if is_index_masked is not None:
is_index_masked = is_index_masked[:, 0:self._output_range]
if is_index_global_attn is not None:
is_index_global_attn = is_index_global_attn[:, 0:self._output_range]
else:
if self._norm_first:
source_tensor = input_tensor
input_tensor = self._attention_layer_norm(input_tensor)
if key_value is not None:
key_value = self._attention_layer_norm(key_value)
target_tensor = input_tensor
if key_value is None:
key_value = input_tensor
attention_output = self._attention_layer(
hidden_states=target_tensor,
attention_mask=attention_mask,
is_index_masked=is_index_masked,
is_index_global_attn=is_index_global_attn,
)
# TFLongformerAttention.TFLongformerSelfOutput.* - {.dense}
attention_output = self._attention_dropout(attention_output)
if self._norm_first:
attention_output = source_tensor + attention_output
else:
attention_output = self._attention_layer_norm(target_tensor +
attention_output)
if self._norm_first:
source_attention_output = attention_output
attention_output = self._output_layer_norm(attention_output)
# TFLongformerIntermediate
inner_output = self._intermediate_dense(attention_output)
inner_output = self._intermediate_activation_layer(inner_output)
inner_output = self._inner_dropout_layer(inner_output)
# TFLongformerOutput
layer_output = self._output_dense(inner_output)
layer_output = self._output_dropout(layer_output)
if self._norm_first:
return source_attention_output + layer_output
# During mixed precision training, layer norm output is always fp32 for now.
# Casts fp32 for the subsequent add.
layer_output = tf.cast(layer_output, tf.float32)
return self._output_layer_norm(layer_output + attention_output)
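A usage sketch for a single block, mirroring how `LongformerEncoder.call` drives its layers; the mask construction follows the encoder code above, and the small sizes are purely illustrative:

```python
import tensorflow as tf

from official.projects.longformer.longformer_encoder_block import LongformerEncoderBlock

batch_size, seq_len, hidden_size = 2, 8, 16
block = LongformerEncoderBlock(
    global_attention_size=0,
    num_attention_heads=2,
    inner_dim=32,
    inner_activation='relu',
    attention_window=4,
    layer_id=0)

x = tf.random.normal((batch_size, seq_len, hidden_size))
mask = tf.ones((batch_size, seq_len), tf.int32)  # 1 = attend, as in the encoder
attention_mask = tf.cast(
    tf.abs(1 - tf.reshape(mask, (batch_size, seq_len, 1, 1))), tf.float32) * -10000.0
is_index_masked = tf.math.less(mask, 1)
is_index_global_attn = tf.zeros((batch_size, seq_len), tf.bool)

y = block([x, attention_mask, is_index_masked, is_index_global_attn])
print(y.shape)  # (2, 8, 16)
```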
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.projects.longformer.longformer_encoder."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from official.projects.longformer.longformer_encoder import LongformerEncoder
class LongformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
def setUp(self):
super(LongformerEncoderTest, self).setUp()
np.random.seed(0)
tf.random.set_seed(0)
@combinations.generate(
combinations.combine(
attention_window=[32, 128], global_attention_size=[0, 1, 2]))
def test_encoder(self, attention_window, global_attention_size):
sequence_length = 128
batch_size = 2
vocab_size = 1024
hidden_size = 256
network = LongformerEncoder(
global_attention_size=global_attention_size,
vocab_size=vocab_size,
attention_window=[attention_window],
hidden_size=hidden_size,
num_layers=1,
num_attention_heads=4,
max_sequence_length=512)
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length), dtype=np.int32)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
type_id_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
inputs = {
'input_word_ids': word_id_data,
'input_mask': mask_data,
'input_type_ids': type_id_data,
}
outputs = network(inputs)
self.assertEqual(outputs['sequence_output'].shape,
(batch_size, sequence_length, hidden_size))
@combinations.generate(
combinations.combine(
norm_first=[True, False], global_attention_size=[0, 1, 2]))
def test_norm_first(self, norm_first, global_attention_size):
sequence_length = 128
batch_size = 2
vocab_size = 1024
hidden_size = 256
network = LongformerEncoder(
global_attention_size=global_attention_size,
vocab_size=vocab_size,
attention_window=[32],
hidden_size=hidden_size,
num_layers=1,
num_attention_heads=4,
max_sequence_length=512,
norm_first=norm_first)
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length), dtype=np.int32)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
type_id_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
inputs = {
'input_word_ids': word_id_data,
'input_mask': mask_data,
'input_type_ids': type_id_data,
}
outputs = network(inputs)
self.assertEqual(outputs['sequence_output'].shape,
(batch_size, sequence_length, hidden_size))
if __name__ == '__main__':
tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Longformer experiments."""
# pylint: disable=g-doc-return-or-yield,line-too-long
import dataclasses
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import optimization
from official.nlp.configs import bert
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.nlp.data import sentence_prediction_dataloader
from official.nlp.tasks import masked_lm
from official.nlp.tasks import sentence_prediction
from official.projects.longformer.longformer import LongformerEncoderConfig
AdamWeightDecay = optimization.AdamWeightDecayConfig
PolynomialLr = optimization.PolynomialLrConfig
PolynomialWarmupConfig = optimization.PolynomialWarmupConfig
@dataclasses.dataclass
class LongformerOptimizationConfig(optimization.OptimizationConfig):
"""Longformer optimization configuration."""
optimizer: optimization.OptimizerConfig = optimization.OptimizerConfig(
type='adamw',
adamw=AdamWeightDecay(
weight_decay_rate=0.01,
exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
epsilon=1e-6))
learning_rate: optimization.LrConfig = optimization.LrConfig(
type='polynomial',
polynomial=PolynomialLr(
initial_learning_rate=1e-4,
decay_steps=1000000,
end_learning_rate=0.0))
warmup: optimization.WarmupConfig = optimization.WarmupConfig(
type='polynomial', polynomial=PolynomialWarmupConfig(warmup_steps=10000))
@exp_factory.register_config_factory('longformer/pretraining')
def longformer_pretraining() -> cfg.ExperimentConfig:
"""Longformer pretraining experiment."""
config = cfg.ExperimentConfig(
runtime=cfg.RuntimeConfig(enable_xla=True),
task=masked_lm.MaskedLMConfig(
model=bert.PretrainerConfig(
encoder=encoders.EncoderConfig(
type='any', any=LongformerEncoderConfig()),
cls_heads=[
bert.ClsHeadConfig(
inner_dim=768,
num_classes=2,
dropout_rate=0.1,
name='next_sentence')
]),
train_data=pretrain_dataloader.BertPretrainDataConfig(
use_v2_feature_names=True),
validation_data=pretrain_dataloader.BertPretrainDataConfig(
use_v2_feature_names=True, is_training=False)),
trainer=cfg.TrainerConfig(
optimizer_config=LongformerOptimizationConfig(), train_steps=1000000),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
@exp_factory.register_config_factory('longformer/glue')
def longformer_glue() -> cfg.ExperimentConfig:
"""Longformer glue fine-tuning."""
config = cfg.ExperimentConfig(
task=sentence_prediction.SentencePredictionConfig(
model=sentence_prediction.ModelConfig(
encoder=encoders.EncoderConfig(
type='any', any=LongformerEncoderConfig())),
train_data=sentence_prediction_dataloader
.SentencePredictionDataConfig(),
validation_data=sentence_prediction_dataloader
.SentencePredictionDataConfig(
is_training=False, drop_remainder=False)),
trainer=cfg.TrainerConfig(
optimizer_config=optimization.OptimizationConfig({
'optimizer': {
'type': 'adamw',
'adamw': {
'weight_decay_rate':
0.01,
'exclude_from_weight_decay':
['LayerNorm', 'layer_norm', 'bias'],
}
},
'learning_rate': {
'type': 'polynomial',
'polynomial': {
'initial_learning_rate': 3e-5,
'end_learning_rate': 0.0,
}
},
'warmup': {
'type': 'polynomial'
}
})),
restrictions=[
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
return config
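As a quick sanity check, the registered experiments can be fetched through `exp_factory` and tweaked in place (a sketch; field names follow the configs referenced above):

```python
from official.core import exp_factory
from official.projects.longformer import longformer_experiments  # pylint: disable=unused-import

pretrain_cfg = exp_factory.get_exp_config('longformer/pretraining')
print(pretrain_cfg.task.model.encoder.type)                  # 'any'
print(pretrain_cfg.trainer.optimizer_config.optimizer.type)  # 'adamw'

glue_cfg = exp_factory.get_exp_config('longformer/glue')
glue_cfg.task.model.num_classes = 3  # e.g. MNLI, as in the YAML configs above
```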
@@ -12,22 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
-"""TensorFlow Model Garden Vision training driver."""
+"""A customized training library for the specific task."""
 from absl import app
 from absl import flags
 import gin
-# pylint: disable=unused-import
-from official.common import registry_imports
-# pylint: enable=unused-import
 from official.common import distribute_utils
 from official.common import flags as tfm_flags
 from official.core import task_factory
 from official.core import train_lib
 from official.core import train_utils
 from official.modeling import performance
+from official.projects.longformer import longformer_experiments  # pylint: disable=unused-import
 FLAGS = flags.FLAGS
@@ -51,7 +48,9 @@ def main(_):
       distribution_strategy=params.runtime.distribution_strategy,
       all_reduce_alg=params.runtime.all_reduce_alg,
       num_gpus=params.runtime.num_gpus,
-      tpu_address=params.runtime.tpu)
+      tpu_address=params.runtime.tpu,
+      **params.runtime.model_parallelism())
   with distribution_strategy.scope():
     task = task_factory.get_task(params.task, logging_dir=model_dir)
@@ -64,7 +63,7 @@ def main(_):
   train_utils.save_gin_config(FLAGS.mode, model_dir)
 if __name__ == '__main__':
   tfm_flags.define_flags()
+  flags.mark_flags_as_required(['experiment', 'mode', 'model_dir'])
   app.run(main)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Converts pre-trained pytorch checkpoint into a tf encoder checkpoint."""
import os
from absl import app
import numpy as np
import tensorflow as tf
import transformers
from official.modeling import tf_utils
from official.projects.longformer.longformer import LongformerEncoderConfig
from official.projects.longformer.longformer_encoder import LongformerEncoder
def _get_pytorch_longformer_model():
pretrained_lm = "allenai/longformer-base-4096"
model = transformers.AutoModel.from_pretrained(pretrained_lm)
return {n: p.data.numpy() for n, p in model.named_parameters()}
def _create_longformer_model():
"""Creates a Longformer model."""
  encoder_cfg = LongformerEncoderConfig()
encoder_cfg.vocab_size = 50265
encoder_cfg.max_position_embeddings = 4098
encoder_cfg.attention_window = [2] * encoder_cfg.num_layers
encoder_cfg.global_attention_size = 1
encoder = LongformerEncoder(
attention_window=encoder_cfg.attention_window,
global_attention_size=encoder_cfg.global_attention_size,
vocab_size=encoder_cfg.vocab_size,
hidden_size=encoder_cfg.hidden_size,
num_layers=encoder_cfg.num_layers,
num_attention_heads=encoder_cfg.num_attention_heads,
inner_dim=encoder_cfg.intermediate_size,
inner_activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
output_dropout=encoder_cfg.dropout_rate,
attention_dropout=encoder_cfg.attention_dropout_rate,
max_sequence_length=encoder_cfg.max_position_embeddings,
type_vocab_size=encoder_cfg.type_vocab_size,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=encoder_cfg.initializer_range),
output_range=encoder_cfg.output_range,
embedding_width=encoder_cfg.embedding_size,
norm_first=encoder_cfg.norm_first)
return encoder
# pylint: disable=protected-access
def convert(encoder, allenai_model):
"""Convert AllenAI Longformer to the one in the codebase."""
num_layers = encoder._config["num_layers"]
num_attention_heads = encoder._config["num_attention_heads"]
hidden_size = encoder._config["hidden_size"]
head_size = hidden_size // num_attention_heads
assert head_size * num_attention_heads == hidden_size
encoder._embedding_layer.set_weights(
[allenai_model["embeddings.word_embeddings.weight"]])
encoder._embedding_norm_layer.set_weights([
allenai_model["embeddings.LayerNorm.weight"],
allenai_model["embeddings.LayerNorm.bias"]
])
encoder._type_embedding_layer.set_weights([
np.repeat(
allenai_model["embeddings.token_type_embeddings.weight"], 2, axis=0)
])
encoder._position_embedding_layer.set_weights(
[allenai_model["embeddings.position_embeddings.weight"]])
encoder._pooler_layer.set_weights([
allenai_model["pooler.dense.weight"], allenai_model["pooler.dense.bias"]
])
for layer_num in range(num_layers):
encoder._transformer_layers[
layer_num]._attention_layer._global_key_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.key_global.weight"].T
.reshape(
(hidden_size, num_attention_heads, head_size)), allenai_model[
f"encoder.layer.{layer_num}.attention.self.key_global.bias"]
.reshape((num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._global_query_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.query_global.weight"]
.T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[
f"encoder.layer.{layer_num}.attention.self.query_global.bias"]
.reshape((num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._global_value_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.value_global.weight"]
.T.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[
f"encoder.layer.{layer_num}.attention.self.value_global.bias"]
.reshape((num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._key_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.key.weight"].T
.reshape(
(hidden_size, num_attention_heads, head_size)), allenai_model[
f"encoder.layer.{layer_num}.attention.self.key_global.bias"]
.reshape((num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._query_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.query.weight"].T
.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[
f"encoder.layer.{layer_num}.attention.self.query.bias"].reshape(
(num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._value_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.self.value.weight"].T
.reshape((hidden_size, num_attention_heads, head_size)),
allenai_model[
f"encoder.layer.{layer_num}.attention.self.value.bias"].reshape(
(num_attention_heads, head_size))
])
encoder._transformer_layers[
layer_num]._attention_layer._output_dense.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.output.dense.weight"].T,
allenai_model[
f"encoder.layer.{layer_num}.attention.output.dense.bias"]
])
encoder._transformer_layers[layer_num]._attention_layer_norm.set_weights([
allenai_model[
f"encoder.layer.{layer_num}.attention.output.LayerNorm.weight"],
allenai_model[
f"encoder.layer.{layer_num}.attention.output.LayerNorm.bias"]
])
encoder._transformer_layers[layer_num]._intermediate_dense.set_weights([
allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.weight"].T,
allenai_model[f"encoder.layer.{layer_num}.intermediate.dense.bias"]
])
encoder._transformer_layers[layer_num]._output_dense.set_weights([
allenai_model[f"encoder.layer.{layer_num}.output.dense.weight"].T,
allenai_model[f"encoder.layer.{layer_num}.output.dense.bias"]
])
encoder._transformer_layers[layer_num]._output_layer_norm.set_weights([
allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.weight"],
allenai_model[f"encoder.layer.{layer_num}.output.LayerNorm.bias"]
])
def convert_checkpoint(output_path):
"""Converts and save the checkpoint."""
output_dir, _ = os.path.split(output_path)
tf.io.gfile.makedirs(output_dir)
encoder = _create_longformer_model()
allenai_model = _get_pytorch_longformer_model()
sequence_length = 128
batch_size = 2
word_id_data = np.random.randint(
10, size=(batch_size, sequence_length), dtype=np.int32)
mask_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
type_id_data = np.random.randint(
2, size=(batch_size, sequence_length), dtype=np.int32)
inputs = {
"input_word_ids": word_id_data,
"input_mask": mask_data,
"input_type_ids": type_id_data,
}
encoder(inputs)
convert(encoder, allenai_model)
tf.train.Checkpoint(encoder=encoder).write(output_path)
def main(_):
convert_checkpoint("longformer-4096/longformer")
if __name__ == "__main__":
app.run(main)
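Continuing the converter script above, a sketch of restoring the written checkpoint into a freshly built encoder (the path matches the one used in `main`, and `expect_partial()` silences warnings about any unconsumed checkpoint entries):

```python
encoder = _create_longformer_model()
# Run one forward pass so all variables are created before restoring.
dummy_inputs = {
    "input_word_ids": np.zeros((1, 128), dtype=np.int32),
    "input_mask": np.ones((1, 128), dtype=np.int32),
    "input_type_ids": np.zeros((1, 128), dtype=np.int32),
}
encoder(dummy_inputs)
tf.train.Checkpoint(encoder=encoder).read(
    "longformer-4096/longformer").expect_partial()
```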
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Longformer training examples to Tfrecord."""
import collections
import os
import datasets
import tensorflow as tf
import transformers
pretrained_lm = "allenai/longformer-base-4096"
task_name = "mnli"
save_path = "./"
raw_datasets = datasets.load_dataset("glue", task_name, cache_dir=None)
label_list = raw_datasets["train"].features["label"].names
num_labels = len(label_list)
tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained_lm,
use_fast=True,
)
task_to_keys = {
"cola": ("sentence", None),
"mnli": ("premise", "hypothesis"),
"mrpc": ("sentence1", "sentence2"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"rte": ("sentence1", "sentence2"),
"sst2": ("sentence", None),
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
sentence1_key, sentence2_key = task_to_keys[task_name]
padding = "max_length"
# make sure this is the same with model input size.
max_seq_length = 512
def preprocess_function(examples):
# Tokenize the texts
args = ((examples[sentence1_key],) if sentence2_key is None else
(examples[sentence1_key], examples[sentence2_key]))
result = tokenizer(
*args, padding=padding, max_length=max_seq_length, truncation=True)
return result
raw_datasets = raw_datasets.map(
preprocess_function,
batched=True,
desc="Running tokenizer on dataset",
)
train_dataset = raw_datasets["train"]
eval_dataset = raw_datasets["validation_matched" if task_name ==
"mnli" else "validation"]
print("train_dataset", train_dataset[0])
print("eval_dataset", eval_dataset[0])
def file_based_convert_examples_to_features(examples, output_file):
"""Convert a set of `InputExample`s to a TFRecord file."""
tf.io.gfile.makedirs(os.path.dirname(output_file))
writer = tf.io.TFRecordWriter(output_file)
for ex_index, example in enumerate(examples):
if ex_index % 10000 == 0:
print(f"Writing example {ex_index} of {len(examples)}")
def create_int_feature(values):
f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
return f
features = collections.OrderedDict()
features["input_ids"] = create_int_feature(example["input_ids"])
features["input_mask"] = create_int_feature(example["attention_mask"])
features["segment_ids"] = create_int_feature([0] *
len(example["attention_mask"]))
features["label_ids"] = create_int_feature([example["label"]])
features["is_real_example"] = create_int_feature([1])
features["example_id"] = create_int_feature([example["idx"]])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()
file_based_convert_examples_to_features(
train_dataset,
os.path.join(save_path,
f"{pretrained_lm.replace('/', '_')}_train.tf_record"))
file_based_convert_examples_to_features(
eval_dataset,
os.path.join(save_path,
f"{pretrained_lm.replace('/', '_')}_eval.tf_record"))
@@ -176,8 +176,7 @@ devices. See the [TF Lite Example](#tf-lite-example) to export and run your own
 models. We also provide [quantized TF Lite binaries via TF Hub](https://tfhub.dev/s?deployment-format=lite&q=movinet).
 For reference, MoViNet-A0-Stream runs with a similar latency to
-[MobileNetV3-Large]
-(https://tfhub.dev/google/imagenet/mobilenet_v3_large_100_224/classification/)
+[MobileNetV3-Large](https://tfhub.dev/google/imagenet/mobilenet_v3_large_100_224/classification/)
 with +5% accuracy on Kinetics 600.
 | Model Name | Input Shape | Pixel 4 Latency\* | x86 Latency\* | TF Lite Binary |
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Contains definitions of Mobile Video Networks.
 Reference: https://arxiv.org/pdf/2103.11511.pdf
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Contains common building blocks for MoViNets.
 Reference: https://arxiv.org/pdf/2103.11511.pdf
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Tests for movinet_layers.py."""
 from absl.testing import parameterized
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Tests for movinet_model.py."""
 from absl.testing import parameterized
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 """Tests for movinet.py."""
 from absl.testing import parameterized
...
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Lint as: python3
 r"""Exports models to tf.saved_model.
 Export example:
...