ModelZoo / ResNet50_tensorflow · Commits

Commit 8be7de91, authored Dec 02, 2021 by Zihan Wang
roformer
parent 59aa6732
Showing 12 changed files with 1751 additions and 0 deletions (+1751, -0).
- official/projects/roformer/README.md (+12, -0)
- official/projects/roformer/__init__.py (+13, -0)
- official/projects/roformer/experiments/roformer_base.yaml (+62, -0)
- official/projects/roformer/roformer.py (+56, -0)
- official/projects/roformer/roformer_attention.py (+104, -0)
- official/projects/roformer/roformer_attention_test.py (+147, -0)
- official/projects/roformer/roformer_encoder.py (+288, -0)
- official/projects/roformer/roformer_encoder_block.py (+321, -0)
- official/projects/roformer/roformer_encoder_block_test.py (+324, -0)
- official/projects/roformer/roformer_encoder_test.py (+236, -0)
- official/projects/roformer/roformer_experiments.py (+119, -0)
- official/projects/roformer/train.py (+69, -0)
official/projects/roformer/README.md (new file, mode 100644)

Code for Roformer. Run with

```bash
python3 train.py \
  --experiment=roformer/pretraining \
  --config_file=experiments/roformer_base.yaml \
  --params_override="task.validation_data.input_path=gs://tf_model_garden/nlp/data/research_data/bert_pretrain/wikipedia.tfrecord-00000-of-00500,runtime.distribution_strategy=tpu" \
  --tpu=local \
  --model_dir=<OUTPUT_DIR> \
  --mode=train_and_eval
```
official/projects/roformer/__init__.py (new file, mode 100644)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
official/projects/roformer/experiments/roformer_base.yaml (new file, mode 100644)

(Diff collapsed in the original view; contents not shown.)
official/projects/roformer/roformer.py (new file, mode 100644)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Roformer model configurations and instantiation methods."""
import dataclasses

import gin
import tensorflow as tf

from official.modeling import tf_utils
from official.modeling.hyperparams import base_config
from official.nlp.configs import encoders
from roformer_encoder import RoformerEncoder


class RoformerEncoderConfig(encoders.BertEncoderConfig):
  pass


@gin.configurable
@base_config.bind(RoformerEncoderConfig)
def get_encoder(encoder_cfg: RoformerEncoderConfig):
  """Gets a 'RoformerEncoder' object.

  Args:
    encoder_cfg: A 'modeling.BertConfig'.

  Returns:
    An encoder object.
  """
  return RoformerEncoder(
      vocab_size=encoder_cfg.vocab_size,
      hidden_size=encoder_cfg.hidden_size,
      num_layers=encoder_cfg.num_layers,
      num_attention_heads=encoder_cfg.num_attention_heads,
      intermediate_size=encoder_cfg.intermediate_size,
      activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
      dropout_rate=encoder_cfg.dropout_rate,
      attention_dropout_rate=encoder_cfg.attention_dropout_rate,
      max_sequence_length=encoder_cfg.max_position_embeddings,
      type_vocab_size=encoder_cfg.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=encoder_cfg.initializer_range),
      output_range=encoder_cfg.output_range,
      embedding_width=encoder_cfg.embedding_size,
      norm_first=encoder_cfg.norm_first)
```
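For orientation, a minimal sketch of how `get_encoder` might be driven from a config object. It assumes the usual `encoders.BertEncoderConfig` fields referenced above (`vocab_size`, `hidden_size`, etc.); the concrete values are illustrative, not taken from the commit.

```python
# Hypothetical usage sketch; field names follow encoders.BertEncoderConfig as
# read by get_encoder above, values are made up for illustration.
from roformer import RoformerEncoderConfig, get_encoder

encoder_cfg = RoformerEncoderConfig(
    vocab_size=30522,
    hidden_size=768,        # hidden_size per head should be even (see FIXME below)
    num_layers=12,
    num_attention_heads=12)
encoder = get_encoder(encoder_cfg)  # returns a RoformerEncoder Keras model
```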
official/projects/roformer/roformer_attention.py (new file, mode 100644)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Roformer attention layer."""
# pylint: disable=g-classes-have-attributes
import math
import tensorflow as tf

EinsumDense = tf.keras.layers.experimental.EinsumDense
MultiHeadAttention = tf.keras.layers.MultiHeadAttention


def _build_trig_vector(length, key_dim):
  tf_dtype = tf.keras.mixed_precision.global_policy().compute_dtype
  position_ids = tf.cast(tf.range(length), dtype=tf_dtype)
  position_ids = tf.expand_dims(position_ids, axis=0)
  steps = key_dim // 2
  indices = tf.cast(tf.range(steps), dtype=tf_dtype)
  indices = tf.pow(
      tf.constant(10000.0, dtype=tf_dtype), -2 * indices / steps)
  vec = tf.einsum('bl,d->bld', position_ids, indices)
  sin_vec = tf.repeat(tf.sin(vec), repeats=2, axis=-1)
  cos_vec = tf.repeat(tf.cos(vec), repeats=2, axis=-1)
  sin_vec, cos_vec = tf.expand_dims(sin_vec, 2), tf.expand_dims(cos_vec, 2)
  return sin_vec, cos_vec


@tf.keras.utils.register_keras_serializable(package="Text")
class RoformerAttention(tf.keras.layers.MultiHeadAttention):

  def __init__(self,
               q_max_sequence_length,
               kv_max_sequence_length,
               output_range=None,
               **kwargs):
    """
    Args:
      q_max_sequence_length: maximum length in input for the query.
      kv_max_sequence_length: maximum length in input for key and value, can
        be different from q_max_sequence_length.
      output_range: length of the query tensor to consider.
    """
    super().__init__(**kwargs)
    self._q_max_sequence_length = q_max_sequence_length
    self._kv_max_sequence_length = kv_max_sequence_length
    assert self._key_dim % 2 == 0
    q_sin_vec, q_cos_vec = _build_trig_vector(self._q_max_sequence_length,
                                              self._key_dim)
    k_sin_vec, k_cos_vec = _build_trig_vector(self._kv_max_sequence_length,
                                              self._key_dim)
    self.q_sin_vec, self.q_cos_vec = (
        (q_sin_vec, q_cos_vec) if output_range is None else
        (q_sin_vec[:, 0:output_range, ...], q_cos_vec[:, 0:output_range, ...]))
    self.k_sin_vec, self.k_cos_vec = (k_sin_vec, k_cos_vec)

  def roformer_recompute_qkv(self, q, k, v):
    q_shape = tf.shape(q)
    q_len = q_shape[1]
    k_shape = tf.shape(k)
    k_len = k_shape[1]

    q2 = tf.stack([-q[..., 1::2], q[..., ::2]], axis=4)
    q2 = tf.reshape(q2, q_shape)
    k2 = tf.stack([-k[..., 1::2], k[..., ::2]], axis=4)
    k2 = tf.reshape(k2, k_shape)

    ret_q = (q * self.q_cos_vec[:, 0:q_len, ...] +
             q2 * self.q_sin_vec[:, 0:q_len, ...])
    ret_w = (k * self.k_cos_vec[:, 0:k_len, ...] +
             k2 * self.k_sin_vec[:, 0:k_len, ...])
    return ret_q, ret_w, v

  def call(self,
           query,
           value,
           key=None,
           attention_mask=None,
           return_attention_scores=False,
           training=None):
    if not self._built_from_signature:
      self._build_from_signature(query=query, value=value, key=key)
    if key is None:
      key = value

    query = self._query_dense(query)
    key = self._key_dense(key)
    value = self._value_dense(value)

    query, key, value = self.roformer_recompute_qkv(query, key, value)

    attention_output, attention_scores = self._compute_attention(
        query, key, value, attention_mask, training)
    attention_output = self._output_dense(attention_output)

    if return_attention_scores:
      return attention_output, attention_scores
    return attention_output
```
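`_build_trig_vector` and `roformer_recompute_qkv` together implement the rotary position embedding: each even/odd pair of query (and key) features at position m is rotated by an angle m·θ_i, so the query-key dot product ends up depending only on relative position. In the code's own terms (d = `key_dim`, steps = d/2), what is applied to a query q at position m is, as a compact restatement of the code rather than anything stated in the commit:

```latex
% Rotation applied by roformer_recompute_qkv (d = key_dim, steps = d/2):
\theta_i = 10000^{-2i/(d/2)}, \qquad i = 0, \dots, d/2 - 1
\hat q_{2i}   = q_{2i}\cos(m\theta_i) - q_{2i+1}\sin(m\theta_i)
\hat q_{2i+1} = q_{2i+1}\cos(m\theta_i) + q_{2i}\sin(m\theta_i)
```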
official/projects/roformer/roformer_attention_test.py (new file, mode 100644)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the attention layer."""
import numpy as np
import tensorflow as tf

from tensorflow.python.distribute import combinations
from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import

import roformer_attention


def _create_mock_attention_data(num_heads,
                                key_dim,
                                value_dim,
                                q_seq_length,
                                kv_seq_length,
                                batch_size,
                                include_mask=False):
  """Creates mock testing data.

  Args:
    num_heads: `int`, Number of attention heads.
    key_dim: `int`, Size of query head.
    value_dim: `int`, Size of key, value dim.
    q_seq_length: `int`, query sequence length of the input.
    kv_seq_length: `int`, key/value sequence length of the input.
    batch_size: `int`, the batch size.
    include_mask: optional `bool`, whether or not to include mask data.

  Returns:
    A dictionary with `str` as keys and `Tensor` as values.
  """
  query_shape = (batch_size, q_seq_length, key_dim)
  value_shape = (batch_size, kv_seq_length, value_dim)

  data = dict(
      query=tf.random.normal(shape=query_shape),
      value=tf.random.normal(shape=value_shape),
      key=tf.random.normal(shape=value_shape))

  total_seq_length = kv_seq_length

  if include_mask:
    mask_shape = (batch_size, num_heads, q_seq_length, total_seq_length)
    mask_data = np.random.randint(2, size=mask_shape).astype("float32")
    mask_data = dict(attention_mask=mask_data)
    data.update(mask_data)

  return data


@keras_parameterized.run_all_keras_modes
class RoformerAttentionTest(keras_parameterized.TestCase):

  def setUp(self):
    super(RoformerAttentionTest, self).setUp()
    np.random.seed(0)
    tf.random.set_seed(0)

  @combinations.generate(
      combinations.combine(length=[8, 50], key_dim=[64, 128]))
  def test_trig_vector(self, length, key_dim):
    sin_emb, cos_emb = roformer_attention._build_trig_vector(length, key_dim)
    length = tf.shape(sin_emb)[1]
    key_dim = tf.shape(sin_emb)[3]
    for m in range(0, length):
      half_d = key_dim // 2
      std_emb = tf.range(half_d, dtype=tf.float32)
      std_emb = tf.pow(10000.0, -2 * std_emb / float(half_d))
      std_emb = m * std_emb
      std_sin_emb = tf.sin(std_emb)
      std_cos_emb = tf.cos(std_emb)
      tf.assert_equal(sin_emb[:, m, :, 0::2], std_sin_emb)
      tf.assert_equal(sin_emb[:, m, :, 1::2], std_sin_emb)
      tf.assert_equal(cos_emb[:, m, :, 0::2], std_cos_emb)
      tf.assert_equal(cos_emb[:, m, :, 1::2], std_cos_emb)

  @combinations.generate(
      combinations.combine(value_dim=[32, 64], mask=[True, False]))
  def test_attention_scores(self, value_dim, mask):
    """Tests combinations of attention score calculations."""
    batch_size, num_heads, key_dim, seq_length = 2, 12, 64, 8
    test_layer = roformer_attention.RoformerAttention(
        q_max_sequence_length=seq_length,
        kv_max_sequence_length=seq_length,
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim)
    data = _create_mock_attention_data(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim,
        q_seq_length=seq_length,
        kv_seq_length=seq_length,
        batch_size=batch_size,
        include_mask=mask)
    output = test_layer(**data)
    self.assertEqual(output.shape, [batch_size, seq_length, key_dim])

  @combinations.generate(
      combinations.combine(value_dim=[32, 64], mask=[True, False]))
  def test_separate_qkv_attention_scores(self, value_dim, mask):
    """Tests attention scores with separate query and key/value lengths."""
    batch_size, num_heads, key_dim, q_seq_length, kv_seq_length = 2, 12, 64, 8, 16
    test_layer = roformer_attention.RoformerAttention(
        q_max_sequence_length=q_seq_length,
        kv_max_sequence_length=kv_seq_length,
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim)
    data = _create_mock_attention_data(
        num_heads=num_heads,
        key_dim=key_dim,
        value_dim=value_dim,
        q_seq_length=q_seq_length,
        kv_seq_length=kv_seq_length,
        batch_size=batch_size,
        include_mask=mask)
    output = test_layer(**data)
    self.assertEqual(output.shape, [batch_size, q_seq_length, key_dim])


if __name__ == "__main__":
  tf.test.main()
```
official/projects/roformer/roformer_encoder.py (new file, mode 100644)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Roformer encoder network.

Modified from official/nlp/keras_nlp/encoders/bert_encoder.py.
"""
# pylint: disable=g-classes-have-attributes
import collections
from absl import logging
import tensorflow as tf

from official.nlp.modeling import networks
from official.nlp.modeling.layers import on_device_embedding
from official.nlp.modeling.layers import self_attention_mask
from roformer_encoder_block import RoformerEncoderBlock


@tf.keras.utils.register_keras_serializable(package='Text')
class RoformerEncoder(tf.keras.Model):
  """Bi-directional Transformer-based encoder network.

  This network implements a bi-directional Transformer-based encoder as
  described in "BERT: Pre-training of Deep Bidirectional Transformers for
  Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
  embedding lookups and transformer layers, but not the masked language model
  or classification task networks.

  The default values for this object are taken from the BERT-Base
  implementation in "BERT: Pre-training of Deep Bidirectional Transformers for
  Language Understanding".

  *Note* that the network is constructed by
  [Keras Functional API](https://keras.io/guides/functional_api/).

  Args:
    vocab_size: The size of the token vocabulary.
    hidden_size: The size of the transformer hidden layers.
    num_layers: The number of transformer layers.
    num_attention_heads: The number of attention heads for each transformer.
      The hidden size must be divisible by the number of attention heads.
    max_sequence_length: The maximum sequence length that this encoder can
      consume. If None, max_sequence_length uses the value from sequence
      length. This determines the variable shape for positional embeddings.
    type_vocab_size: The number of types that the 'type_ids' input can take.
    inner_dim: The output dimension of the first Dense layer in a two-layer
      feedforward network for each transformer.
    inner_activation: The activation for the first Dense layer in a two-layer
      feedforward network for each transformer.
    output_dropout: Dropout probability for the post-attention and output
      dropout.
    attention_dropout: The dropout rate to use for the attention layers
      within the transformer layers.
    initializer: The initializer to use for all weights in this encoder.
    output_range: The sequence output range, [0, output_range), by slicing
      the target sequence of the last transformer layer. `None` means the
      entire target sequence will attend to the source sequence, which yields
      the full output.
    embedding_width: The width of the word embeddings. If the embedding width
      is not equal to hidden size, embedding parameters will be factorized
      into two matrices in the shape of ['vocab_size', 'embedding_width'] and
      ['embedding_width', 'hidden_size'] ('embedding_width' is usually much
      smaller than 'hidden_size').
    embedding_layer: An optional Layer instance which will be called to
      generate embeddings for the input word IDs.
    norm_first: Whether to normalize inputs to attention and intermediate
      dense layers. If set False, output of attention and intermediate dense
      layers is normalized.
  """

  def __init__(
      self,
      vocab_size,
      hidden_size=768,  # FIXME: hidden_size per head should be even!
      num_layers=12,
      num_attention_heads=12,
      max_sequence_length=512,
      type_vocab_size=16,
      inner_dim=3072,
      inner_activation=lambda x: tf.keras.activations.gelu(
          x, approximate=True),
      output_dropout=0.1,
      attention_dropout=0.1,
      initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
      output_range=None,
      embedding_width=None,
      embedding_layer=None,
      norm_first=False,
      **kwargs):
    if 'intermediate_size' in kwargs:
      inner_dim = kwargs['intermediate_size']
      del kwargs['intermediate_size']
    if 'activation' in kwargs:
      inner_activation = kwargs['activation']
      del kwargs['activation']
    if 'dropout_rate' in kwargs:
      output_dropout = kwargs['dropout_rate']
      del kwargs['dropout_rate']
    if 'attention_dropout_rate' in kwargs:
      attention_dropout = kwargs['attention_dropout_rate']
      del kwargs['attention_dropout_rate']

    activation = tf.keras.activations.get(inner_activation)
    initializer = tf.keras.initializers.get(initializer)

    word_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_word_ids')
    mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_mask')
    type_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_type_ids')

    if embedding_width is None:
      embedding_width = hidden_size

    if embedding_layer is None:
      embedding_layer_inst = on_device_embedding.OnDeviceEmbedding(
          vocab_size=vocab_size,
          embedding_width=embedding_width,
          initializer=initializer,
          name='word_embeddings')
    else:
      embedding_layer_inst = embedding_layer
    word_embeddings = embedding_layer_inst(word_ids)

    # Roformer does not need a position embedding layer
    type_embedding_layer = on_device_embedding.OnDeviceEmbedding(
        vocab_size=type_vocab_size,
        embedding_width=embedding_width,
        initializer=initializer,
        use_one_hot=True,
        name='type_embeddings')
    type_embeddings = type_embedding_layer(type_ids)

    # Roformer does not have absolute position embedding
    embeddings = tf.keras.layers.Add()([word_embeddings, type_embeddings])

    embedding_norm_layer = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)

    embeddings = embedding_norm_layer(embeddings)
    embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

    # We project the 'embedding' output to 'hidden_size' if it is not already
    # 'hidden_size'.
    if embedding_width != hidden_size:
      embedding_projection = tf.keras.layers.experimental.EinsumDense(
          '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
          kernel_initializer=initializer,
          name='embedding_projection')
      embeddings = embedding_projection(embeddings)
    else:
      embedding_projection = None

    transformer_layers = []
    data = embeddings
    attention_mask = self_attention_mask.SelfAttentionMask()(data, mask)
    encoder_outputs = []
    for i in range(num_layers):
      if i == num_layers - 1 and output_range is not None:
        transformer_output_range = output_range
      else:
        transformer_output_range = None
      layer = RoformerEncoderBlock(
          num_attention_heads=num_attention_heads,
          inner_dim=inner_dim,
          inner_activation=inner_activation,
          q_max_sequence_length=max_sequence_length,
          kv_max_sequence_length=max_sequence_length,
          output_dropout=output_dropout,
          attention_dropout=attention_dropout,
          norm_first=norm_first,
          output_range=transformer_output_range,
          kernel_initializer=initializer,
          name='roformer/layer_%d' % i)
      transformer_layers.append(layer)
      data = layer([data, attention_mask])
      encoder_outputs.append(data)

    last_encoder_output = encoder_outputs[-1]
    # Applying a tf.slice op (through subscript notation) to a Keras tensor
    # like this will create a SliceOpLambda layer. This is better than a
    # Lambda layer with Python code, because that is fundamentally less
    # portable.
    first_token_tensor = last_encoder_output[:, 0, :]
    pooler_layer = tf.keras.layers.Dense(
        units=hidden_size,
        activation='tanh',
        kernel_initializer=initializer,
        name='pooler_transform')
    cls_output = pooler_layer(first_token_tensor)

    outputs = dict(
        sequence_output=encoder_outputs[-1],
        pooled_output=cls_output,
        encoder_outputs=encoder_outputs,
    )

    # Once we've created the network using the Functional API, we call
    # super().__init__ as though we were invoking the Functional API Model
    # constructor, resulting in this object having all the properties of a
    # model created using the Functional API. Once super().__init__ is
    # called, we can assign attributes to `self` - note that all `self`
    # assignments are below this line.
    super(RoformerEncoder, self).__init__(
        inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)

    config_dict = {
        'vocab_size': vocab_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'num_attention_heads': num_attention_heads,
        'max_sequence_length': max_sequence_length,
        'type_vocab_size': type_vocab_size,
        'inner_dim': inner_dim,
        'inner_activation': tf.keras.activations.serialize(activation),
        'output_dropout': output_dropout,
        'attention_dropout': attention_dropout,
        'initializer': tf.keras.initializers.serialize(initializer),
        'output_range': output_range,
        'embedding_width': embedding_width,
        'embedding_layer': embedding_layer,
        'norm_first': norm_first,
    }

    # We are storing the config dict as a namedtuple here to ensure checkpoint
    # compatibility with an earlier version of this model which did not track
    # the config dict attribute. TF does not track immutable attrs which
    # do not contain Trackables, so by creating a config namedtuple instead of
    # a dict we avoid tracking it.
    config_cls = collections.namedtuple('Config', config_dict.keys())
    self._config = config_cls(**config_dict)
    self._pooler_layer = pooler_layer
    self._transformer_layers = transformer_layers
    self._embedding_norm_layer = embedding_norm_layer
    self._embedding_layer = embedding_layer_inst
    # self._position_embedding_layer = position_embedding_layer
    self._position_embedding_layer = None
    self._type_embedding_layer = type_embedding_layer
    if embedding_projection is not None:
      self._embedding_projection = embedding_projection

  def get_embedding_table(self):
    return self._embedding_layer.embeddings

  def get_embedding_layer(self):
    return self._embedding_layer

  def get_config(self):
    return dict(self._config._asdict())

  @property
  def transformer_layers(self):
    """List of Transformer layers in the encoder."""
    return self._transformer_layers

  @property
  def pooler_layer(self):
    """The pooler dense layer after the transformer layers."""
    return self._pooler_layer

  @classmethod
  def from_config(cls, config, custom_objects=None):
    if 'embedding_layer' in config and config['embedding_layer'] is not None:
      warn_string = (
          'You are reloading a model that was saved with a '
          'potentially-shared embedding layer object. If you continue to '
          'train this model, the embedding layer will no longer be shared. '
          'To work around this, load the model outside of the Keras API.')
      print('WARNING: ' + warn_string)
      logging.warn(warn_string)
    return cls(**config)
```
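A minimal usage sketch, mirroring the encoder tests further down: the model takes `input_word_ids`, `input_mask`, and `input_type_ids` and returns a dict with `sequence_output`, `pooled_output`, and `encoder_outputs`. The sizes below are illustrative only.

```python
# Hypothetical smoke test of RoformerEncoder; shapes and sizes are illustrative.
import numpy as np
from roformer_encoder import RoformerEncoder

encoder = RoformerEncoder(
    vocab_size=100, hidden_size=32, num_attention_heads=2, num_layers=3)
batch, seq_len = 2, 21
word_ids = np.random.randint(100, size=(batch, seq_len)).astype(np.int32)
mask = np.ones((batch, seq_len), dtype=np.int32)
type_ids = np.zeros((batch, seq_len), dtype=np.int32)
outputs = encoder([word_ids, mask, type_ids])
print(outputs["sequence_output"].shape)  # (2, 21, 32)
print(outputs["pooled_output"].shape)    # (2, 32)
```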
official/projects/roformer/roformer_encoder_block.py (new file, mode 100644)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-based TransformerEncoder block layer.

Modified from official/nlp/keras_nlp/layers/transformer_encoder_block.py.
"""
import tensorflow as tf

from roformer_attention import RoformerAttention


@tf.keras.utils.register_keras_serializable(package='Text')
class RoformerEncoderBlock(tf.keras.layers.Layer):
  """TransformerEncoderBlock layer.

  This layer implements the Transformer Encoder from
  "Attention Is All You Need" (https://arxiv.org/abs/1706.03762),
  which combines a `tf.keras.layers.MultiHeadAttention` layer with a
  two-layer feedforward network.

  References:
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    [BERT: Pre-training of Deep Bidirectional Transformers for Language
     Understanding](https://arxiv.org/abs/1810.04805)
  """

  def __init__(self,
               num_attention_heads,
               inner_dim,  # FIXME: hidden_size per head should be even!
               inner_activation,
               q_max_sequence_length=512,
               kv_max_sequence_length=512,
               output_range=None,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               use_bias=True,
               norm_first=False,
               norm_epsilon=1e-12,
               output_dropout=0.0,
               attention_dropout=0.0,
               inner_dropout=0.0,
               attention_initializer=None,
               attention_axes=None,
               **kwargs):
    """Initializes `TransformerEncoderBlock`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a two-layer
        feedforward network.
      inner_activation: The activation for the first Dense layer in a
        two-layer feedforward network.
      output_range: the sequence output range, [0, output_range) for slicing
        the target sequence. `None` means the target sequence is not sliced.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in attention layer. If set False,
        use_bias in attention layer is disabled.
      norm_first: Whether to normalize inputs to attention and intermediate
        dense layers. If set False, output of attention and intermediate
        dense layers is normalized.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability for within the attention layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers. If
        set `None`, attention layers use kernel_initializer as initializer
        for kernel.
      attention_axes: axes over which the attention is applied. `None` means
        attention over all axes, but batch, heads, and features.
      **kwargs: keyword arguments.
    """
    super().__init__(**kwargs)

    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    self._attention_dropout = attention_dropout
    self._attention_dropout_rate = attention_dropout
    self._output_dropout = output_dropout
    self._output_dropout_rate = output_dropout
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(
        activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout
    self._q_max_sequence_length = q_max_sequence_length
    self._kv_max_sequence_length = kv_max_sequence_length
    if attention_initializer:
      self._attention_initializer = tf.keras.initializers.get(
          attention_initializer)
    else:
      self._attention_initializer = self._kernel_initializer
    self._attention_axes = attention_axes

  def build(self, input_shape):
    if isinstance(input_shape, tf.TensorShape):
      input_tensor_shape = input_shape
    elif isinstance(input_shape, (list, tuple)):
      input_tensor_shape = tf.TensorShape(input_shape[0])
    else:
      raise ValueError(
          "The type of input shape argument is not supported, got: %s" %
          type(input_shape))
    einsum_equation = "abc,cd->abd"
    if len(input_tensor_shape.as_list()) > 3:
      einsum_equation = "...bc,cd->...bd"
    hidden_size = input_tensor_shape[-1]
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    self._attention_layer = RoformerAttention(
        q_max_sequence_length=self._q_max_sequence_length,
        kv_max_sequence_length=self._kv_max_sequence_length,
        output_range=self._output_range,
        num_heads=self._num_heads,
        key_dim=self._attention_head_size,
        dropout=self._attention_dropout,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        attention_axes=self._attention_axes,
        name="self_attention",
        **common_kwargs)
    self._attention_dropout = tf.keras.layers.Dropout(
        rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, self._inner_dim),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="intermediate",
        **common_kwargs)
    policy = tf.keras.mixed_precision.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      # TODO(b/154538392): Investigate this.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._inner_activation, dtype=policy)
    self._inner_dropout_layer = tf.keras.layers.Dropout(
        rate=self._inner_dropout)
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, hidden_size),
        bias_axes="d",
        name="output",
        kernel_initializer=self._kernel_initializer,
        **common_kwargs)
    self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype=tf.float32)

    super(RoformerEncoderBlock, self).build(input_shape)

  def get_config(self):
    config = {
        "num_attention_heads": self._num_heads,
        "inner_dim": self._inner_dim,
        "inner_activation": self._inner_activation,
        "output_dropout": self._output_dropout_rate,
        "attention_dropout": self._attention_dropout_rate,
        "output_range": self._output_range,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint),
        "use_bias": self._use_bias,
        "norm_first": self._norm_first,
        "norm_epsilon": self._norm_epsilon,
        "inner_dropout": self._inner_dropout,
        "attention_initializer":
            tf.keras.initializers.serialize(self._attention_initializer),
        "attention_axes": self._attention_axes,
    }
    base_config = super(RoformerEncoderBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Transformer self-attention encoder block call.

    Args:
      inputs: a single tensor or a list of tensors.
        `input tensor` as the single sequence of embeddings.
        [`input tensor`, `attention mask`] to have the additional attention
          mask.
        [`query tensor`, `key value tensor`, `attention mask`] to have
          separate input streams for the query, and key/value to the
          multi-head attention.

    Returns:
      An output tensor with the same dimensions as input/query tensor.
    """
    if isinstance(inputs, (list, tuple)):
      if len(inputs) == 2:
        input_tensor, attention_mask = inputs
        key_value = None
      elif len(inputs) == 3:
        input_tensor, key_value, attention_mask = inputs
      else:
        raise ValueError("Unexpected inputs to %s with length at %d" %
                         (self.__class__, len(inputs)))
    else:
      input_tensor, key_value, attention_mask = (inputs, None, None)

    if self._output_range:
      if self._norm_first:
        source_tensor = input_tensor[:, 0:self._output_range, :]
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor[:, 0:self._output_range, :]
      if attention_mask is not None:
        attention_mask = attention_mask[:, 0:self._output_range, :]
    else:
      if self._norm_first:
        source_tensor = input_tensor
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor

    if key_value is None:
      key_value = input_tensor
    attention_output = self._attention_layer(
        query=target_tensor, value=key_value, attention_mask=attention_mask)
    attention_output = self._attention_dropout(attention_output)
    if self._norm_first:
      attention_output = source_tensor + attention_output
    else:
      attention_output = self._attention_layer_norm(target_tensor +
                                                    attention_output)
    if self._norm_first:
      source_attention_output = attention_output
      attention_output = self._output_layer_norm(attention_output)
    inner_output = self._intermediate_dense(attention_output)
    inner_output = self._intermediate_activation_layer(inner_output)
    inner_output = self._inner_dropout_layer(inner_output)
    layer_output = self._output_dense(inner_output)
    layer_output = self._output_dropout(layer_output)

    if self._norm_first:
      return source_attention_output + layer_output

    # During mixed precision training, layer norm output is always fp32 for
    # now. Casts fp32 for the subsequent add.
    layer_output = tf.cast(layer_output, tf.float32)
    return self._output_layer_norm(layer_output + attention_output)
```
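As the `call` docstring notes, the block accepts a bare embeddings tensor, `[tensor, mask]`, or `[query, key_value, mask]`. A short sketch of the two-input form, with illustrative shapes only (it mirrors the mask tests in the file below):

```python
# Hypothetical invocation of RoformerEncoderBlock with an attention mask;
# the tensor sizes here are made up for illustration.
import tensorflow as tf
from roformer_encoder_block import RoformerEncoderBlock

block = RoformerEncoderBlock(
    num_attention_heads=2, inner_dim=128, inner_activation='relu')
embeddings = tf.zeros([2, 4, 16])     # (batch, seq_len, hidden); 16/2 heads gives an even head size
attention_mask = tf.ones([2, 4, 4])   # (batch, from_seq_len, to_seq_len)
output = block([embeddings, attention_mask])  # same shape as embeddings
```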
official/projects/roformer/roformer_encoder_block_test.py (new file, mode 100644)

```python
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for Keras-based transformer block layer."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import

from roformer_encoder_block import RoformerEncoderBlock


@keras_parameterized.run_all_keras_modes
@parameterized.named_parameters(('base', RoformerEncoderBlock))
class RoformerEncoderBlockLayerTest(keras_parameterized.TestCase):

  def tearDown(self):
    super(RoformerEncoderBlockLayerTest, self).tearDown()
    tf.keras.mixed_precision.set_global_policy('float32')

  def test_layer_creation(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the input.
    self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())

  def test_layer_creation_with_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])
    # The default output of a transformer layer should be the same as the input.
    self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())

  def test_layer_invocation(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output_tensor = test_layer(data_tensor)
    # Create a model from the test layer.
    model = tf.keras.Model(data_tensor, output_tensor)
    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    _ = model.predict(input_data)

  def test_layer_invocation_with_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])
    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)
    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])

  def test_layer_output_range(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    output_tensor = test_layer([input_data, mask_data])

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1)
    _ = new_layer([input_data, mask_data])
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer([input_data, mask_data])
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)

  def test_layer_output_range_without_mask(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        norm_first=True)
    sequence_length = 21
    width = 80
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    output_tensor = test_layer(input_data)

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1,
        norm_first=True)
    _ = new_layer(input_data)
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer(input_data)
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)

  def test_layer_output_range_with_pre_norm(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        norm_first=True)
    sequence_length = 21
    width = 80
    batch_size = 6
    input_data = 10 * np.random.random_sample(
        (batch_size, sequence_length, width))
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    output_tensor = test_layer([input_data, mask_data])

    # The layer only attends to the first token and outputs the first token
    # embedding.
    new_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        output_range=1,
        norm_first=True)
    _ = new_layer([input_data, mask_data])
    new_layer.set_weights(test_layer.get_weights())
    new_output_tensor = new_layer([input_data, mask_data])
    self.assertAllClose(
        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)

  def test_layer_invocation_with_float16_dtype(self, transformer_cls):
    tf.keras.mixed_precision.set_global_policy('mixed_float16')
    test_layer = transformer_cls(
        num_attention_heads=10, inner_dim=2048, inner_activation='relu')
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    # Create a 2-dimensional input (the first dimension is implicit).
    mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length))
    output_tensor = test_layer([data_tensor, mask_tensor])
    # Create a model from the test layer.
    model = tf.keras.Model([data_tensor, mask_tensor], output_tensor)
    # Invoke the model on test data. We can't validate the output data itself
    # (the NN is too complex) but this will rule out structural runtime errors.
    batch_size = 6
    input_data = (10 * np.random.random_sample(
        (batch_size, sequence_length, width)))
    # The attention mask should be of shape (batch, from_seq_len, to_seq_len),
    # which here is (batch, sequence_length, sequence_length)
    mask_data = np.random.randint(
        2, size=(batch_size, sequence_length, sequence_length))
    _ = model.predict([input_data, mask_data])

  def test_transform_with_initializer(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=10,
        inner_dim=2048,
        inner_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    sequence_length = 21
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(sequence_length, width))
    output = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the input.
    self.assertEqual(data_tensor.shape.as_list(), output.shape.as_list())

  # def test_dynamic_layer_sequence(self, transformer_cls):
  #   test_layer = transformer_cls(
  #       num_attention_heads=10,
  #       inner_dim=2048,
  #       inner_activation='relu',
  #       kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
  #   # Create a 3-dimensional input (the first dimension is implicit).
  #   width = 30
  #   input_tensor = tf.keras.Input(shape=(None, width))
  #   output_tensor = test_layer(input_tensor)
  #   model = tf.keras.Model(input_tensor, output_tensor)
  #
  #   input_length = 17
  #   input_data = np.ones((1, input_length, width))
  #   output_data = model.predict(input_data)
  #
  #   self.assertAllEqual([1, input_length, width], output_data.shape)

  def test_separate_qkv(self, transformer_cls):
    test_layer = transformer_cls(
        num_attention_heads=2,
        inner_dim=128,
        inner_activation='relu',
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
    # Forward path.
    q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32)
    inputs = [q_tensor, kv_tensor, dummy_mask]
    output = test_layer(inputs)
    self.assertEqual(output.shape, q_tensor.shape)


@keras_parameterized.run_all_keras_modes
class RoformerArgumentTest(keras_parameterized.TestCase):

  def test_use_bias_norm_first(self):
    num_attention_heads = 2
    hidden_size = 16
    encoder_block = RoformerEncoderBlock(
        num_attention_heads=num_attention_heads,
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        attention_initializer=tf.keras.initializers.RandomUniform(
            minval=0., maxval=1.))
    # Forward path.
    dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32)
    dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32)
    inputs = [dummy_tensor, dummy_mask]
    output = encoder_block(inputs)
    self.assertEqual(output.shape, (2, 4, hidden_size))

  def test_get_config(self):
    num_attention_heads = 2
    encoder_block = RoformerEncoderBlock(
        num_attention_heads=num_attention_heads,
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        attention_initializer=tf.keras.initializers.RandomUniform(
            minval=0., maxval=1.))
    encoder_block_config = encoder_block.get_config()
    new_encoder_block = RoformerEncoderBlock.from_config(encoder_block_config)
    self.assertEqual(encoder_block_config, new_encoder_block.get_config())

  @parameterized.parameters({'attention_axes': None}, {'attention_axes': [1]},
                            {'attention_axes': [2]}, {'attention_axes': [1, 2]})
  def test_several_attention_axes(self, attention_axes):
    test_layer = RoformerEncoderBlock(
        inner_dim=32,
        inner_activation='relu',
        output_dropout=0.1,
        attention_dropout=0.1,
        use_bias=False,
        norm_first=True,
        norm_epsilon=1e-6,
        inner_dropout=0.1,
        num_attention_heads=10,
        attention_axes=attention_axes)
    num_rows = 21
    num_cols = 13
    width = 80
    # Create a 3-dimensional input (the first dimension is implicit).
    data_tensor = tf.keras.Input(shape=(num_rows, num_cols, width))
    output_tensor = test_layer(data_tensor)
    # The default output of a transformer layer should be the same as the input.
    self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list())


if __name__ == '__main__':
  tf.test.main()
```
official/projects/roformer/roformer_encoder_test.py
0 → 100644
View file @
8be7de91
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for transformer-based bert encoder network."""
from
absl.testing
import
parameterized
import
numpy
as
np
import
tensorflow
as
tf
from
tensorflow.python.keras
import
keras_parameterized
# pylint: disable=g-direct-tensorflow-import
from
official.nlp.configs
import
encoders
from
official.nlp.modeling
import
networks
import
roformer_encoder
import
roformer
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@
keras_parameterized
.
run_all_keras_modes
class
RoformerEncoderTest
(
keras_parameterized
.
TestCase
):
def
tearDown
(
self
):
super
(
RoformerEncoderTest
,
self
).
tearDown
()
tf
.
keras
.
mixed_precision
.
set_global_policy
(
"float32"
)
def
test_network_creation
(
self
):
hidden_size
=
32
sequence_length
=
21
# Create a small BertEncoder for testing.
test_network
=
roformer_encoder
.
RoformerEncoder
(
vocab_size
=
100
,
hidden_size
=
hidden_size
,
num_attention_heads
=
2
,
num_layers
=
3
)
# Create the inputs (note that the first dimension is implicit).
word_ids
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,),
dtype
=
tf
.
int32
)
mask
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,),
dtype
=
tf
.
int32
)
type_ids
=
tf
.
keras
.
Input
(
shape
=
(
sequence_length
,),
dtype
=
tf
.
int32
)
dict_outputs
=
test_network
([
word_ids
,
mask
,
type_ids
])
data
=
dict_outputs
[
"sequence_output"
]
pooled
=
dict_outputs
[
"pooled_output"
]
self
.
assertIsInstance
(
test_network
.
transformer_layers
,
list
)
self
.
assertLen
(
test_network
.
transformer_layers
,
3
)
self
.
assertIsInstance
(
test_network
.
pooler_layer
,
tf
.
keras
.
layers
.
Dense
)
expected_data_shape
=
[
None
,
sequence_length
,
hidden_size
]
expected_pooled_shape
=
[
None
,
hidden_size
]
self
.
assertAllEqual
(
expected_data_shape
,
data
.
shape
.
as_list
())
self
.
assertAllEqual
(
expected_pooled_shape
,
pooled
.
shape
.
as_list
())
# The default output dtype is float32.
self
.
assertAllEqual
(
tf
.
float32
,
data
.
dtype
)
self
.
assertAllEqual
(
tf
.
float32
,
pooled
.
dtype
)
def
test_all_encoder_outputs_network_creation
(
self
):
hidden_size
=
32
sequence_length
=
21
# Create a small BertEncoder for testing.
test_network
=
roformer_encoder
.
RoformerEncoder
(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    all_encoder_outputs = dict_outputs["encoder_outputs"]
    pooled = dict_outputs["pooled_output"]

    expected_data_shape = [None, sequence_length, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertLen(all_encoder_outputs, 3)
    for data in all_encoder_outputs:
      self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # The default output dtype is float32.
    self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
    self.assertAllEqual(tf.float32, pooled.dtype)

  def test_network_creation_with_float16_dtype(self):
    hidden_size = 32
    sequence_length = 21
    tf.keras.mixed_precision.set_global_policy("mixed_float16")
    # Create a small RoformerEncoder for testing.
    test_network = roformer_encoder.RoformerEncoder(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    expected_data_shape = [None, sequence_length, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # If float_dtype is set to float16, the data output is float32 (from a
    # layer norm) and the pooled output should be float16.
    self.assertAllEqual(tf.float32, data.dtype)
    self.assertAllEqual(tf.float16, pooled.dtype)

  @parameterized.named_parameters(
      ("all_sequence", None, 21),
      ("output_range", 1, 1),
  )
  def test_network_invocation(self, output_range, out_seq_len):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57
    num_types = 7
    # Create a small RoformerEncoder for testing.
    test_network = roformer_encoder.RoformerEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        output_range=output_range)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    # Create a model based off of this network:
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])

    # Invoke the model. We can't validate the output data here (the model is
    # too complex) but this will catch structural runtime errors.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[1], out_seq_len)

    # Creates a RoformerEncoder with max_sequence_length != sequence_length.
    max_sequence_length = 128
    test_network = roformer_encoder.RoformerEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_sequence_length=max_sequence_length,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[1], sequence_length)

    # Creates a RoformerEncoder with embedding_width != hidden_size.
    test_network = roformer_encoder.RoformerEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_sequence_length=max_sequence_length,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        embedding_width=16)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[-1], hidden_size)
    self.assertTrue(hasattr(test_network, "_embedding_projection"))

  def test_serialize_deserialize(self):
    # Create a network object that sets all of its config options.
    kwargs = dict(
        vocab_size=100,
        hidden_size=32,
        num_layers=3,
        num_attention_heads=2,
        max_sequence_length=21,
        type_vocab_size=12,
        inner_dim=1223,
        inner_activation="relu",
        output_dropout=0.05,
        attention_dropout=0.22,
        initializer="glorot_uniform",
        output_range=-1,
        embedding_width=16,
        embedding_layer=None,
        norm_first=False)
    network = roformer_encoder.RoformerEncoder(**kwargs)

    expected_config = dict(kwargs)
    expected_config["inner_activation"] = tf.keras.activations.serialize(
        tf.keras.activations.get(expected_config["inner_activation"]))
    expected_config["initializer"] = tf.keras.initializers.serialize(
        tf.keras.initializers.get(expected_config["initializer"]))
    self.assertEqual(network.get_config(), expected_config)

    # Create another network object from the first object's config.
    new_network = roformer_encoder.RoformerEncoder.from_config(
        network.get_config())

    # Validate that the config can be forced to JSON.
    _ = network.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(network.get_config(), new_network.get_config())

    # Tests model saving/loading.
    model_path = self.get_temp_dir() + "/model"
    network.save(model_path)
    _ = tf.keras.models.load_model(model_path)


if __name__ == "__main__":
  tf.test.main()
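The tests above exercise `RoformerEncoder` through `tf.keras.Input` placeholders and `model.predict`; for quick standalone experimentation the encoder can also be called eagerly on concrete batches. A minimal sketch, assuming `roformer_encoder` is importable from this directory as in the tests; the sizes are illustrative only:

```python
import numpy as np
import tensorflow as tf

import roformer_encoder  # Assumed local import, mirroring this directory's style.

# Same small configuration the tests use.
encoder = roformer_encoder.RoformerEncoder(
    vocab_size=100, hidden_size=32, num_attention_heads=2, num_layers=3)

batch_size, seq_len = 2, 21
word_ids = np.random.randint(100, size=(batch_size, seq_len)).astype(np.int32)
mask = np.ones((batch_size, seq_len), dtype=np.int32)       # attend everywhere
type_ids = np.zeros((batch_size, seq_len), dtype=np.int32)  # single segment

outputs = encoder([word_ids, mask, type_ids])
print(outputs["sequence_output"].shape)  # (2, 21, 32)
print(outputs["pooled_output"].shape)    # (2, 32)
```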
official/projects/roformer/roformer_experiments.py
0 → 100644
View file @
8be7de91
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Copied from nlp/projects/bigbird/experiment_configs.py
"""
"""Bigbird experiment configurations."""
# pylint: disable=g-doc-return-or-yield,line-too-long
import
dataclasses
from
official.core
import
config_definitions
as
cfg
from
official.core
import
exp_factory
from
official.modeling
import
optimization
from
official.nlp.data
import
pretrain_dataloader
from
official.nlp.tasks
import
masked_lm
from
official.nlp.data
import
sentence_prediction_dataloader
from
official.nlp.tasks
import
sentence_prediction
from
official.nlp.configs
import
bert
from
official.nlp.configs
import
encoders
import
roformer
AdamWeightDecay
=
optimization
.
AdamWeightDecayConfig
PolynomialLr
=
optimization
.
PolynomialLrConfig
PolynomialWarmupConfig
=
optimization
.
PolynomialWarmupConfig
@
dataclasses
.
dataclass
class
RoformerOptimizationConfig
(
optimization
.
OptimizationConfig
):
"""TEAMS optimization config."""
optimizer
:
optimization
.
OptimizerConfig
=
optimization
.
OptimizerConfig
(
type
=
"adamw"
,
adamw
=
AdamWeightDecay
(
weight_decay_rate
=
0.01
,
exclude_from_weight_decay
=
[
"LayerNorm"
,
"layer_norm"
,
"bias"
],
epsilon
=
1e-6
))
learning_rate
:
optimization
.
LrConfig
=
optimization
.
LrConfig
(
type
=
"polynomial"
,
polynomial
=
PolynomialLr
(
initial_learning_rate
=
1e-4
,
decay_steps
=
1000000
,
end_learning_rate
=
0.0
))
warmup
:
optimization
.
WarmupConfig
=
optimization
.
WarmupConfig
(
type
=
"polynomial"
,
polynomial
=
PolynomialWarmupConfig
(
warmup_steps
=
10000
))
@
exp_factory
.
register_config_factory
(
'roformer/pretraining'
)
def
roformer_pretraining
()
->
cfg
.
ExperimentConfig
:
"""BERT pretraining experiment."""
config
=
cfg
.
ExperimentConfig
(
runtime
=
cfg
.
RuntimeConfig
(
enable_xla
=
True
),
task
=
masked_lm
.
MaskedLMConfig
(
model
=
bert
.
PretrainerConfig
(
encoder
=
encoders
.
EncoderConfig
(
type
=
"any"
,
any
=
roformer
.
RoformerEncoderConfig
()),
cls_heads
=
[
bert
.
ClsHeadConfig
(
inner_dim
=
768
,
num_classes
=
2
,
dropout_rate
=
0.1
,
name
=
'next_sentence'
)
]
),
train_data
=
pretrain_dataloader
.
BertPretrainDataConfig
(
use_v2_feature_names
=
True
),
validation_data
=
pretrain_dataloader
.
BertPretrainDataConfig
(
use_v2_feature_names
=
True
,
is_training
=
False
)),
trainer
=
cfg
.
TrainerConfig
(
optimizer_config
=
RoformerOptimizationConfig
(),
train_steps
=
1000000
),
restrictions
=
[
'task.train_data.is_training != None'
,
'task.validation_data.is_training != None'
])
return
config
@
exp_factory
.
register_config_factory
(
'roformer/glue'
)
def
roformer_glue
()
->
cfg
.
ExperimentConfig
:
r
"""BigBird GLUE."""
config
=
cfg
.
ExperimentConfig
(
task
=
sentence_prediction
.
SentencePredictionConfig
(
model
=
sentence_prediction
.
ModelConfig
(
encoder
=
encoders
.
EncoderConfig
(
type
=
"any"
,
any
=
roformer
.
RoformerEncoderConfig
())),
train_data
=
sentence_prediction_dataloader
.
SentencePredictionDataConfig
(),
validation_data
=
sentence_prediction_dataloader
.
SentencePredictionDataConfig
(
is_training
=
False
,
drop_remainder
=
False
)),
trainer
=
cfg
.
TrainerConfig
(
optimizer_config
=
optimization
.
OptimizationConfig
({
'optimizer'
:
{
'type'
:
'adamw'
,
'adamw'
:
{
'weight_decay_rate'
:
0.01
,
'exclude_from_weight_decay'
:
[
'LayerNorm'
,
'layer_norm'
,
'bias'
],
}
},
'learning_rate'
:
{
'type'
:
'polynomial'
,
'polynomial'
:
{
'initial_learning_rate'
:
3e-5
,
'end_learning_rate'
:
0.0
,
}
},
'warmup'
:
{
'type'
:
'polynomial'
}
})),
restrictions
=
[
'task.train_data.is_training != None'
,
'task.validation_data.is_training != None'
])
return
config
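Because both factories are registered with `exp_factory`, the configs can also be fetched and tweaked programmatically instead of only via `--experiment` and `--params_override`. A minimal sketch; the override keys and values below are placeholders for illustration, not recommended settings:

```python
from official.core import exp_factory

import roformer_experiments  # Importing registers 'roformer/pretraining' and 'roformer/glue'.

config = exp_factory.get_exp_config('roformer/pretraining')
# Nested-dict overrides mirror what --params_override does on the command line.
config.override({
    'task': {'train_data': {'global_batch_size': 32}},
    'trainer': {'train_steps': 10000},
}, is_strict=False)
print(config.task.model.encoder.type)  # 'any', wrapping RoformerEncoderConfig.
```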
official/projects/roformer/train.py
0 → 100644
View file @
8be7de91
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A customized training library for the specific task."""
from absl import app
from absl import flags
import gin

from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance

import roformer_experiments  # Registers the roformer experiment configs.

FLAGS = flags.FLAGS


def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise a continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets the mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can significantly speed up the model by using float16 on GPUs and bfloat16
  # on TPUs. loss_scale takes effect only when the dtype is float16.
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu,
      **params.runtime.model_parallelism())
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)


if __name__ == '__main__':
  tfm_flags.define_flags()
  app.run(main)
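For debugging without the absl flag machinery, the same pieces `main()` wires together can be driven directly from Python. A rough sketch under a single-device strategy; the model directory is a placeholder, and it assumes `params` has been given valid input paths (for example via the override sketch shown after `roformer_experiments.py`):

```python
import tensorflow as tf

from official.core import exp_factory
from official.core import task_factory
from official.core import train_lib

import roformer_experiments  # Importing registers 'roformer/pretraining'.

params = exp_factory.get_exp_config('roformer/pretraining')
# params.override(...) with real input paths and small step counts goes here.

strategy = tf.distribute.OneDeviceStrategy('/cpu:0')
with strategy.scope():
  task = task_factory.get_task(params.task, logging_dir='/tmp/roformer_debug')

train_lib.run_experiment(
    distribution_strategy=strategy,
    task=task,
    mode='train',
    params=params,
    model_dir='/tmp/roformer_debug')
```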