ModelZoo / ResNet50_tensorflow, commit 002b4240

Authored Oct 07, 2021 by Frederick Liu; committed by A. Unique TensorFlower on Oct 07, 2021.

[keras_nlp] Merge keras_nlp into tf_nlp.

PiperOrigin-RevId: 401593694
Parent: 03c096ab

Changes: 27. This page shows 7 changed files with 471 additions and 22 deletions (+471 / -22); the diff continues on a second page.
Changed files on this page:

  official/nlp/modeling/networks/bert_encoder.py               +251   -4
  official/nlp/modeling/networks/bert_encoder_test.py          +207   -0
  official/nlp/modeling/networks/encoder_scaffold.py             +4   -5
  official/nlp/modeling/networks/encoder_scaffold_test.py        +1   -2
  official/nlp/modeling/networks/funnel_transformer.py           +6   -6
  official/nlp/modeling/networks/mobile_bert_encoder.py          +1   -3
  official/nlp/modeling/networks/packed_sequence_embedding.py    +1   -2
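All of the files follow the same migration pattern announced in the commit message: the `from official.nlp import keras_nlp` import is dropped and the same building blocks are referenced through `official.nlp.modeling.layers` instead. A minimal before/after sketch of that import swap (illustrative only, not part of the diff; it assumes the Model Garden `official` package is on the Python path, and the layer name is taken from the diffs below):

    # Before this commit: layers were reached through the keras_nlp package.
    # from official.nlp import keras_nlp
    # attention_mask_layer = keras_nlp.layers.SelfAttentionMask()

    # After this commit: the same layer comes from official.nlp.modeling.
    from official.nlp.modeling import layers

    attention_mask_layer = layers.SelfAttentionMask()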
official/nlp/modeling/networks/bert_encoder.py

...

@@ -20,14 +20,261 @@ from absl import logging
 import tensorflow as tf

 from official.modeling import activations
-from official.nlp import keras_nlp
+from official.nlp.modeling import layers


-# This class is being replaced by keras_nlp.encoders.BertEncoder and merely
+# TODO(b/202413395): Merge V2 and V1.
+@tf.keras.utils.register_keras_serializable(package='Text')
+class BertEncoderV2(tf.keras.Model):
+  """Bi-directional Transformer-based encoder network.
+
+  This network implements a bi-directional Transformer-based encoder as
+  described in "BERT: Pre-training of Deep Bidirectional Transformers for
+  Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
+  embedding lookups and transformer layers, but not the masked language model
+  or classification task networks.
+
+  The default values for this object are taken from the BERT-Base implementation
+  in "BERT: Pre-training of Deep Bidirectional Transformers for Language
+  Understanding".
+
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
+
+  Args:
+    vocab_size: The size of the token vocabulary.
+    hidden_size: The size of the transformer hidden layers.
+    num_layers: The number of transformer layers.
+    num_attention_heads: The number of attention heads for each transformer. The
+      hidden size must be divisible by the number of attention heads.
+    max_sequence_length: The maximum sequence length that this encoder can
+      consume. If None, max_sequence_length uses the value from sequence length.
+      This determines the variable shape for positional embeddings.
+    type_vocab_size: The number of types that the 'type_ids' input can take.
+    inner_dim: The output dimension of the first Dense layer in a two-layer
+      feedforward network for each transformer.
+    inner_activation: The activation for the first Dense layer in a two-layer
+      feedforward network for each transformer.
+    output_dropout: Dropout probability for the post-attention and output
+      dropout.
+    attention_dropout: The dropout rate to use for the attention layers
+      within the transformer layers.
+    initializer: The initializer to use for all weights in this encoder.
+    output_range: The sequence output range, [0, output_range), by slicing the
+      target sequence of the last transformer layer. `None` means the entire
+      target sequence will attend to the source sequence, which yields the full
+      output.
+    embedding_width: The width of the word embeddings. If the embedding width is
+      not equal to hidden size, embedding parameters will be factorized into two
+      matrices in the shape of ['vocab_size', 'embedding_width'] and
+      ['embedding_width', 'hidden_size'] ('embedding_width' is usually much
+      smaller than 'hidden_size').
+    embedding_layer: An optional Layer instance which will be called to
+      generate embeddings for the input word IDs.
+    norm_first: Whether to normalize inputs to attention and intermediate
+      dense layers. If set False, output of attention and intermediate dense
+      layers is normalized.
+  """
+
+  def __init__(
+      self,
+      vocab_size,
+      hidden_size=768,
+      num_layers=12,
+      num_attention_heads=12,
+      max_sequence_length=512,
+      type_vocab_size=16,
+      inner_dim=3072,
+      inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
+      output_dropout=0.1,
+      attention_dropout=0.1,
+      initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
+      output_range=None,
+      embedding_width=None,
+      embedding_layer=None,
+      norm_first=False,
+      **kwargs):
+    activation = tf.keras.activations.get(inner_activation)
+    initializer = tf.keras.initializers.get(initializer)
+
+    word_ids = tf.keras.layers.Input(
+        shape=(None,), dtype=tf.int32, name='input_word_ids')
+    mask = tf.keras.layers.Input(
+        shape=(None,), dtype=tf.int32, name='input_mask')
+    type_ids = tf.keras.layers.Input(
+        shape=(None,), dtype=tf.int32, name='input_type_ids')
+
+    if embedding_width is None:
+      embedding_width = hidden_size
+
+    if embedding_layer is None:
+      embedding_layer_inst = layers.OnDeviceEmbedding(
+          vocab_size=vocab_size,
+          embedding_width=embedding_width,
+          initializer=initializer,
+          name='word_embeddings')
+    else:
+      embedding_layer_inst = embedding_layer
+    word_embeddings = embedding_layer_inst(word_ids)
+
+    # Always uses dynamic slicing for simplicity.
+    position_embedding_layer = layers.PositionEmbedding(
+        initializer=initializer,
+        max_length=max_sequence_length,
+        name='position_embedding')
+    position_embeddings = position_embedding_layer(word_embeddings)
+
+    type_embedding_layer = layers.OnDeviceEmbedding(
+        vocab_size=type_vocab_size,
+        embedding_width=embedding_width,
+        initializer=initializer,
+        use_one_hot=True,
+        name='type_embeddings')
+    type_embeddings = type_embedding_layer(type_ids)
+
+    embeddings = tf.keras.layers.Add()(
+        [word_embeddings, position_embeddings, type_embeddings])
+
+    embedding_norm_layer = tf.keras.layers.LayerNormalization(
+        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
+    embeddings = embedding_norm_layer(embeddings)
+    embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))
+
+    # We project the 'embedding' output to 'hidden_size' if it is not already
+    # 'hidden_size'.
+    if embedding_width != hidden_size:
+      embedding_projection = tf.keras.layers.experimental.EinsumDense(
+          '...x,xy->...y',
+          output_shape=hidden_size,
+          bias_axes='y',
+          kernel_initializer=initializer,
+          name='embedding_projection')
+      embeddings = embedding_projection(embeddings)
+    else:
+      embedding_projection = None
+
+    transformer_layers = []
+    data = embeddings
+    attention_mask = layers.SelfAttentionMask()(data, mask)
+    encoder_outputs = []
+    for i in range(num_layers):
+      if i == num_layers - 1 and output_range is not None:
+        transformer_output_range = output_range
+      else:
+        transformer_output_range = None
+      layer = layers.TransformerEncoderBlock(
+          num_attention_heads=num_attention_heads,
+          inner_dim=inner_dim,
+          inner_activation=inner_activation,
+          output_dropout=output_dropout,
+          attention_dropout=attention_dropout,
+          norm_first=norm_first,
+          output_range=transformer_output_range,
+          kernel_initializer=initializer,
+          name='transformer/layer_%d' % i)
+      transformer_layers.append(layer)
+      data = layer([data, attention_mask])
+      encoder_outputs.append(data)
+
+    last_encoder_output = encoder_outputs[-1]
+    # Applying a tf.slice op (through subscript notation) to a Keras tensor
+    # like this will create a SliceOpLambda layer. This is better than a Lambda
+    # layer with Python code, because that is fundamentally less portable.
+    first_token_tensor = last_encoder_output[:, 0, :]
+    pooler_layer = tf.keras.layers.Dense(
+        units=hidden_size,
+        activation='tanh',
+        kernel_initializer=initializer,
+        name='pooler_transform')
+    cls_output = pooler_layer(first_token_tensor)
+
+    outputs = dict(
+        sequence_output=encoder_outputs[-1],
+        pooled_output=cls_output,
+        encoder_outputs=encoder_outputs,
+    )
+
+    # Once we've created the network using the Functional API, we call
+    # super().__init__ as though we were invoking the Functional API Model
+    # constructor, resulting in this object having all the properties of a model
+    # created using the Functional API. Once super().__init__ is called, we
+    # can assign attributes to `self` - note that all `self` assignments are
+    # below this line.
+    super(BertEncoderV2, self).__init__(
+        inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
+
+    config_dict = {
+        'vocab_size': vocab_size,
+        'hidden_size': hidden_size,
+        'num_layers': num_layers,
+        'num_attention_heads': num_attention_heads,
+        'max_sequence_length': max_sequence_length,
+        'type_vocab_size': type_vocab_size,
+        'inner_dim': inner_dim,
+        'inner_activation': tf.keras.activations.serialize(activation),
+        'output_dropout': output_dropout,
+        'attention_dropout': attention_dropout,
+        'initializer': tf.keras.initializers.serialize(initializer),
+        'output_range': output_range,
+        'embedding_width': embedding_width,
+        'embedding_layer': embedding_layer,
+        'norm_first': norm_first,
+    }
+    # We are storing the config dict as a namedtuple here to ensure checkpoint
+    # compatibility with an earlier version of this model which did not track
+    # the config dict attribute. TF does not track immutable attrs which
+    # do not contain Trackables, so by creating a config namedtuple instead of
+    # a dict we avoid tracking it.
+    config_cls = collections.namedtuple('Config', config_dict.keys())
+    self._config = config_cls(**config_dict)
+    self._pooler_layer = pooler_layer
+    self._transformer_layers = transformer_layers
+    self._embedding_norm_layer = embedding_norm_layer
+    self._embedding_layer = embedding_layer_inst
+    self._position_embedding_layer = position_embedding_layer
+    self._type_embedding_layer = type_embedding_layer
+    if embedding_projection is not None:
+      self._embedding_projection = embedding_projection
+
+  def get_embedding_table(self):
+    return self._embedding_layer.embeddings
+
+  def get_embedding_layer(self):
+    return self._embedding_layer
+
+  def get_config(self):
+    return dict(self._config._asdict())
+
+  @property
+  def transformer_layers(self):
+    """List of Transformer layers in the encoder."""
+    return self._transformer_layers
+
+  @property
+  def pooler_layer(self):
+    """The pooler dense layer after the transformer layers."""
+    return self._pooler_layer
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    if 'embedding_layer' in config and config['embedding_layer'] is not None:
+      warn_string = (
+          'You are reloading a model that was saved with a '
+          'potentially-shared embedding layer object. If you contine to '
+          'train this model, the embedding layer will no longer be shared. '
+          'To work around this, load the model outside of the Keras API.')
+      print('WARNING: ' + warn_string)
+      logging.warn(warn_string)
+    return cls(**config)
+
+
+# This class is being replaced by BertEncoderV2 and merely
 # acts as a wrapper if you need: 1) list outputs instead of dict outputs,
 # 2) shared embedding layer.
 @tf.keras.utils.register_keras_serializable(package='Text')
-class BertEncoder(keras_nlp.encoders.BertEncoder):
+class BertEncoder(BertEncoderV2):
   """Bi-directional Transformer-based encoder network.

   This network implements a bi-directional Transformer-based encoder as
...

@@ -155,5 +402,5 @@ class BertEncoder(keras_nlp.encoders.BertEncoder):
     else:
       sequence_output = nested_output['sequence_output']
     outputs = [sequence_output, cls_output]
-    super(keras_nlp.encoders.BertEncoder, self).__init__(  # pylint: disable=bad-super-call
+    super(BertEncoderV2, self).__init__(  # pylint: disable=bad-super-call
         inputs=self.inputs, outputs=outputs, **kwargs)
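For orientation, here is a small usage sketch of the `BertEncoderV2` network added above, assembled from the constructor signature and the `outputs = dict(...)` block shown in this diff. The tiny sizes mirror the unit tests below and are illustrative only; the sketch is not part of the commit and assumes the Model Garden `official` package is installed.

    import numpy as np
    from official.nlp.modeling.networks import bert_encoder

    # A deliberately small configuration; the defaults in the diff are BERT-Base sized.
    encoder = bert_encoder.BertEncoderV2(
        vocab_size=100,
        hidden_size=32,
        num_layers=3,
        num_attention_heads=2,
        type_vocab_size=2)

    batch_size, seq_len = 2, 8
    word_ids = np.random.randint(100, size=(batch_size, seq_len))
    mask = np.ones((batch_size, seq_len), dtype=np.int32)
    type_ids = np.zeros((batch_size, seq_len), dtype=np.int32)

    # The network returns a dict, per the outputs block in the diff above.
    outputs = encoder([word_ids, mask, type_ids])
    print(outputs['sequence_output'].shape)   # (2, 8, 32)
    print(outputs['pooled_output'].shape)     # (2, 32)
    print(len(outputs['encoder_outputs']))    # 3, one tensor per transformer layer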
official/nlp/modeling/networks/bert_encoder_test.py

...

@@ -23,6 +23,213 @@ from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
 from official.nlp.modeling.networks import bert_encoder


+# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
+# guarantees forward compatibility of this code for the V2 switchover.
+@keras_parameterized.run_all_keras_modes
+class BertEncoderV2Test(keras_parameterized.TestCase):
+
+  def tearDown(self):
+    super(BertEncoderV2Test, self).tearDown()
+    tf.keras.mixed_precision.set_global_policy("float32")
+
+  def test_network_creation(self):
+    hidden_size = 32
+    sequence_length = 21
+    # Create a small BertEncoder for testing.
+    test_network = bert_encoder.BertEncoderV2(
+        vocab_size=100,
+        hidden_size=hidden_size,
+        num_attention_heads=2,
+        num_layers=3)
+    # Create the inputs (note that the first dimension is implicit).
+    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    dict_outputs = test_network([word_ids, mask, type_ids])
+    data = dict_outputs["sequence_output"]
+    pooled = dict_outputs["pooled_output"]
+
+    self.assertIsInstance(test_network.transformer_layers, list)
+    self.assertLen(test_network.transformer_layers, 3)
+    self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
+
+    expected_data_shape = [None, sequence_length, hidden_size]
+    expected_pooled_shape = [None, hidden_size]
+    self.assertAllEqual(expected_data_shape, data.shape.as_list())
+    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
+
+    # The default output dtype is float32.
+    self.assertAllEqual(tf.float32, data.dtype)
+    self.assertAllEqual(tf.float32, pooled.dtype)
+
+  def test_all_encoder_outputs_network_creation(self):
+    hidden_size = 32
+    sequence_length = 21
+    # Create a small BertEncoder for testing.
+    test_network = bert_encoder.BertEncoderV2(
+        vocab_size=100,
+        hidden_size=hidden_size,
+        num_attention_heads=2,
+        num_layers=3)
+    # Create the inputs (note that the first dimension is implicit).
+    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    dict_outputs = test_network([word_ids, mask, type_ids])
+    all_encoder_outputs = dict_outputs["encoder_outputs"]
+    pooled = dict_outputs["pooled_output"]
+
+    expected_data_shape = [None, sequence_length, hidden_size]
+    expected_pooled_shape = [None, hidden_size]
+    self.assertLen(all_encoder_outputs, 3)
+    for data in all_encoder_outputs:
+      self.assertAllEqual(expected_data_shape, data.shape.as_list())
+    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
+
+    # The default output dtype is float32.
+    self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
+    self.assertAllEqual(tf.float32, pooled.dtype)
+
+  def test_network_creation_with_float16_dtype(self):
+    hidden_size = 32
+    sequence_length = 21
+    tf.keras.mixed_precision.set_global_policy("mixed_float16")
+    # Create a small BertEncoder for testing.
+    test_network = bert_encoder.BertEncoderV2(
+        vocab_size=100,
+        hidden_size=hidden_size,
+        num_attention_heads=2,
+        num_layers=3)
+    # Create the inputs (note that the first dimension is implicit).
+    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    dict_outputs = test_network([word_ids, mask, type_ids])
+    data = dict_outputs["sequence_output"]
+    pooled = dict_outputs["pooled_output"]
+
+    expected_data_shape = [None, sequence_length, hidden_size]
+    expected_pooled_shape = [None, hidden_size]
+    self.assertAllEqual(expected_data_shape, data.shape.as_list())
+    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
+
+    # If float_dtype is set to float16, the data output is float32 (from a layer
+    # norm) and pool output should be float16.
+    self.assertAllEqual(tf.float32, data.dtype)
+    self.assertAllEqual(tf.float16, pooled.dtype)
+
+  @parameterized.named_parameters(
+      ("all_sequence", None, 21),
+      ("output_range", 1, 1),
+  )
+  def test_network_invocation(self, output_range, out_seq_len):
+    hidden_size = 32
+    sequence_length = 21
+    vocab_size = 57
+    num_types = 7
+    # Create a small BertEncoder for testing.
+    test_network = bert_encoder.BertEncoderV2(
+        vocab_size=vocab_size,
+        hidden_size=hidden_size,
+        num_attention_heads=2,
+        num_layers=3,
+        type_vocab_size=num_types,
+        output_range=output_range)
+    # Create the inputs (note that the first dimension is implicit).
+    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    dict_outputs = test_network([word_ids, mask, type_ids])
+    data = dict_outputs["sequence_output"]
+    pooled = dict_outputs["pooled_output"]
+
+    # Create a model based off of this network:
+    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
+
+    # Invoke the model. We can't validate the output data here (the model is too
+    # complex) but this will catch structural runtime errors.
+    batch_size = 3
+    word_id_data = np.random.randint(
+        vocab_size, size=(batch_size, sequence_length))
+    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
+    type_id_data = np.random.randint(
+        num_types, size=(batch_size, sequence_length))
+    outputs = model.predict([word_id_data, mask_data, type_id_data])
+    self.assertEqual(outputs[0].shape[1], out_seq_len)
+
+    # Creates a BertEncoder with max_sequence_length != sequence_length
+    max_sequence_length = 128
+    test_network = bert_encoder.BertEncoderV2(
+        vocab_size=vocab_size,
+        hidden_size=hidden_size,
+        max_sequence_length=max_sequence_length,
+        num_attention_heads=2,
+        num_layers=3,
+        type_vocab_size=num_types)
+    dict_outputs = test_network([word_ids, mask, type_ids])
+    data = dict_outputs["sequence_output"]
+    pooled = dict_outputs["pooled_output"]
+    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
+    outputs = model.predict([word_id_data, mask_data, type_id_data])
+    self.assertEqual(outputs[0].shape[1], sequence_length)
+
+    # Creates a BertEncoder with embedding_width != hidden_size
+    test_network = bert_encoder.BertEncoderV2(
+        vocab_size=vocab_size,
+        hidden_size=hidden_size,
+        max_sequence_length=max_sequence_length,
+        num_attention_heads=2,
+        num_layers=3,
+        type_vocab_size=num_types,
+        embedding_width=16)
+    dict_outputs = test_network([word_ids, mask, type_ids])
+    data = dict_outputs["sequence_output"]
+    pooled = dict_outputs["pooled_output"]
+    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
+    outputs = model.predict([word_id_data, mask_data, type_id_data])
+    self.assertEqual(outputs[0].shape[-1], hidden_size)
+    self.assertTrue(hasattr(test_network, "_embedding_projection"))
+
+  def test_serialize_deserialize(self):
+    # Create a network object that sets all of its config options.
+    kwargs = dict(
+        vocab_size=100,
+        hidden_size=32,
+        num_layers=3,
+        num_attention_heads=2,
+        max_sequence_length=21,
+        type_vocab_size=12,
+        inner_dim=1223,
+        inner_activation="relu",
+        output_dropout=0.05,
+        attention_dropout=0.22,
+        initializer="glorot_uniform",
+        output_range=-1,
+        embedding_width=16,
+        embedding_layer=None,
+        norm_first=False)
+    network = bert_encoder.BertEncoderV2(**kwargs)
+
+    expected_config = dict(kwargs)
+    expected_config["inner_activation"] = tf.keras.activations.serialize(
+        tf.keras.activations.get(expected_config["inner_activation"]))
+    expected_config["initializer"] = tf.keras.initializers.serialize(
+        tf.keras.initializers.get(expected_config["initializer"]))
+    self.assertEqual(network.get_config(), expected_config)
+
+    # Create another network object from the first object's config.
+    new_network = bert_encoder.BertEncoderV2.from_config(network.get_config())
+
+    # Validate that the config can be forced to JSON.
+    _ = network.to_json()
+
+    # If the serialization was successful, the new config should match the old.
+    self.assertAllEqual(network.get_config(), new_network.get_config())
+
+    # Tests model saving/loading.
+    model_path = self.get_temp_dir() + "/model"
+    network.save(model_path)
+    _ = tf.keras.models.load_model(model_path)
+
+
 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
 # guarantees forward compatibility of this code for the V2 switchover.
 @keras_parameterized.run_all_keras_modes

...
official/nlp/modeling/networks/encoder_scaffold.py

...

@@ -21,7 +21,6 @@ from absl import logging
 import gin
 import tensorflow as tf

-from official.nlp import keras_nlp
 from official.nlp.modeling import layers

...

@@ -115,7 +114,7 @@ class EncoderScaffold(tf.keras.Model):
                num_hidden_instances=1,
                hidden_cls=layers.Transformer,
                hidden_cfg=None,
-               mask_cls=keras_nlp.layers.SelfAttentionMask,
+               mask_cls=layers.SelfAttentionMask,
                mask_cfg=None,
                layer_norm_before_pooling=False,
                return_all_layer_outputs=False,

...

@@ -146,7 +145,7 @@ class EncoderScaffold(tf.keras.Model):
           shape=(seq_length,), dtype=tf.int32, name='input_type_ids')
       inputs = [word_ids, mask, type_ids]

-      embedding_layer = keras_nlp.layers.OnDeviceEmbedding(
+      embedding_layer = layers.OnDeviceEmbedding(
           vocab_size=embedding_cfg['vocab_size'],
           embedding_width=embedding_cfg['hidden_size'],
           initializer=embedding_cfg['initializer'],

...

@@ -155,13 +154,13 @@ class EncoderScaffold(tf.keras.Model):
       word_embeddings = embedding_layer(word_ids)
       # Always uses dynamic slicing for simplicity.
-      position_embedding_layer = keras_nlp.layers.PositionEmbedding(
+      position_embedding_layer = layers.PositionEmbedding(
           initializer=embedding_cfg['initializer'],
           max_length=embedding_cfg['max_seq_length'],
           name='position_embedding')
       position_embeddings = position_embedding_layer(word_embeddings)
-      type_embedding_layer = keras_nlp.layers.OnDeviceEmbedding(
+      type_embedding_layer = layers.OnDeviceEmbedding(
          vocab_size=embedding_cfg['type_vocab_size'],
          embedding_width=embedding_cfg['hidden_size'],
          initializer=embedding_cfg['initializer'],

...
official/nlp/modeling/networks/encoder_scaffold_test.py

...

@@ -20,7 +20,6 @@ import tensorflow as tf
 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import

 from official.modeling import activations
-from official.nlp import keras_nlp
 from official.nlp.modeling import layers
 from official.nlp.modeling.networks import encoder_scaffold

...

@@ -54,7 +53,7 @@ class ValidatedTransformerLayer(layers.Transformer):
 # boolean 'True'. We register this class as a Keras serializable so we can
 # test serialization below.
 @tf.keras.utils.register_keras_serializable(package="TestOnly")
-class ValidatedMaskLayer(keras_nlp.layers.SelfAttentionMask):
+class ValidatedMaskLayer(layers.SelfAttentionMask):

   def __init__(self, call_list, call_class=None, **kwargs):
     super(ValidatedMaskLayer, self).__init__(**kwargs)

...
official/nlp/modeling/networks/funnel_transformer.py

...

@@ -19,7 +19,7 @@ from absl import logging
 import numpy as np
 import tensorflow as tf

-from official.nlp import keras_nlp
+from official.nlp.modeling import layers


 def _pool_and_concat(mask, unpool_length: int, strides: Union[Sequence[int],

...

@@ -139,7 +139,7 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
       embedding_width = hidden_size

     if embedding_layer is None:
-      self._embedding_layer = keras_nlp.layers.OnDeviceEmbedding(
+      self._embedding_layer = layers.OnDeviceEmbedding(
           vocab_size=vocab_size,
           embedding_width=embedding_width,
           initializer=initializer,

...

@@ -147,12 +147,12 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
     else:
       self._embedding_layer = embedding_layer

-    self._position_embedding_layer = keras_nlp.layers.PositionEmbedding(
+    self._position_embedding_layer = layers.PositionEmbedding(
         initializer=initializer,
         max_length=max_sequence_length,
        name='position_embedding')
-    self._type_embedding_layer = keras_nlp.layers.OnDeviceEmbedding(
+    self._type_embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=type_vocab_size,
        embedding_width=embedding_width,
        initializer=initializer,

...

@@ -177,10 +177,10 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
        name='embedding_projection')

     self._transformer_layers = []
-    self._attention_mask_layer = keras_nlp.layers.SelfAttentionMask(
+    self._attention_mask_layer = layers.SelfAttentionMask(
        name='self_attention_mask')
     for i in range(num_layers):
-      layer = keras_nlp.layers.TransformerEncoderBlock(
+      layer = layers.TransformerEncoderBlock(
          num_attention_heads=num_attention_heads,
          inner_dim=inner_dim,
          inner_activation=inner_activation,

...
official/nlp/modeling/networks/mobile_bert_encoder.py

...

@@ -16,7 +16,6 @@
 import gin
 import tensorflow as tf

-from official.nlp import keras_nlp
 from official.nlp.modeling import layers

...

@@ -127,8 +126,7 @@ class MobileBERTEncoder(tf.keras.Model):
     self.inputs = [input_ids, input_mask, type_ids]

     # The dtype of `attention_mask` will the same as the dtype of `input_mask`.
-    attention_mask = keras_nlp.layers.SelfAttentionMask()(input_mask,
-                                                          input_mask)
+    attention_mask = layers.SelfAttentionMask()(input_mask,
+                                                input_mask)
     # build the computation graph
     all_layer_outputs = []

...
official/nlp/modeling/networks/packed_sequence_embedding.py

...

@@ -18,7 +18,6 @@ import collections
 import tensorflow as tf

 from official.modeling import tf_utils
-from official.nlp import keras_nlp
 from official.nlp.modeling import layers

...

@@ -137,7 +136,7 @@ class PackedSequenceEmbedding(tf.keras.Model):
           name='embedding_projection')(
               embeddings)

-    attention_mask = keras_nlp.layers.SelfAttentionMask()(embeddings, mask)
+    attention_mask = layers.SelfAttentionMask()(embeddings, mask)
     if sub_seq_mask is not None:
       attention_mask = tf.keras.layers.Lambda(
           lambda x: x[0] * tf.cast(x[1], x[0].dtype))(

...