Commit b0ccdb11 in ModelZoo / ResNet50_tensorflow
Authored Sep 28, 2020 by Shixin Luo
Parents: e61588cd, 1611a8c5

    resolve conflict with master

Changes: 210 files in the commit. Showing 20 changed files with 916 additions and 64 deletions (+916 -64).
official/nlp/bert/model_training_utils.py                        +8   -6
official/nlp/bert/run_classifier.py                              +2   -2
official/nlp/bert/run_pretraining.py                             +3   -4
official/nlp/bert/run_squad.py                                   +3   -5
official/nlp/bert/tf2_encoder_checkpoint_converter.py            +52  -10
official/nlp/bert/tokenization.py                                +3   -3
official/nlp/configs/encoders.py                                 +62  -23
official/nlp/keras_nlp/README.md                                 +37  -0
official/nlp/keras_nlp/__init__.py                               +2   -1
official/nlp/keras_nlp/contributing.md                           +21  -0
official/nlp/keras_nlp/encoders/__init__.py                      +16  -0
official/nlp/keras_nlp/encoders/bert_encoder.py                  +222 -0
official/nlp/keras_nlp/encoders/bert_encoder_test.py             +231 -0
official/nlp/keras_nlp/layers/__init__.py                        +2   -0
official/nlp/keras_nlp/layers/masked_lm.py                       +124 -0
official/nlp/keras_nlp/layers/on_device_embedding.py             +92  -0
official/nlp/keras_nlp/layers/on_device_embedding_test.py        +3   -2
official/nlp/keras_nlp/layers/position_embedding.py              +1   -1
official/nlp/keras_nlp/layers/transformer_encoder_block.py       +4   -2
official/nlp/keras_nlp/layers/transformer_encoder_block_test.py  +28  -5
official/nlp/bert/model_training_utils.py

@@ -25,8 +25,8 @@ import tempfile
 from absl import logging
 import tensorflow as tf
 from tensorflow.python.util import deprecation
+from official.common import distribute_utils
 from official.staging.training import grad_utils
-from official.utils.misc import distribution_utils

 _SUMMARY_TXT = 'training_summary.txt'
 _MIN_SUMMARY_STEPS = 10

@@ -164,9 +164,9 @@ def run_customized_training_loop(
       evaluation is skipped.
     eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
       is not none.
-    metric_fn: A metrics function that returns a Keras Metric object to record
-      evaluation result using evaluation dataset or with training dataset
-      after every epoch.
+    metric_fn: A metrics function that returns either a Keras Metric object or
+      a list of Keras Metric objects to record evaluation result using
+      evaluation dataset or with training dataset after every epoch.
     init_checkpoint: Optional checkpoint to load to `sub_model` returned by
       `model_fn`.
     custom_callbacks: A list of Keras Callbacks objects to run during

@@ -266,7 +266,7 @@ def run_customized_training_loop(
   train_iterator = _get_input_iterator(train_input_fn, strategy)
   eval_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)

-  with distribution_utils.get_strategy_scope(strategy):
+  with distribute_utils.get_strategy_scope(strategy):
     # To correctly place the model weights on accelerators,
     # model and optimizer should be created in scope.
     model, sub_model = model_fn()

@@ -291,7 +291,9 @@ def run_customized_training_loop(
       logging.info('Loading from checkpoint file completed')

     train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-    eval_metrics = [metric_fn()] if metric_fn else []
+    eval_metrics = metric_fn() if metric_fn else []
+    if not isinstance(eval_metrics, list):
+      eval_metrics = [eval_metrics]
     # If evaluation is required, make a copy of metric as it will be used by
     # both train and evaluation.
     train_metrics = [
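The last hunk above changes the `metric_fn` contract: it may now return either a single Keras metric or a list of metrics. A minimal sketch of a list-returning `metric_fn` under that contract (this example is illustrative and not part of the commit):

import tensorflow as tf


def metric_fn():
  # With the change above, run_customized_training_loop accepts either a
  # single Keras metric or a list of metrics; a single metric is wrapped
  # into a list by the training loop.
  return [
      tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32),
      tf.keras.metrics.Mean('weighted_loss', dtype=tf.float32),
  ]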
official/nlp/bert/run_classifier.py

@@ -28,6 +28,7 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models

@@ -35,7 +36,6 @@ from official.nlp.bert import common_flags
 from official.nlp.bert import configs as bert_configs
 from official.nlp.bert import input_pipeline
 from official.nlp.bert import model_saving_utils
-from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils

 flags.DEFINE_enum(

@@ -447,7 +447,7 @@ def custom_main(custom_callbacks=None, custom_metrics=None):
                                      FLAGS.model_dir)
     return
-  strategy = distribution_utils.get_distribution_strategy(
+  strategy = distribute_utils.get_distribution_strategy(
       distribution_strategy=FLAGS.distribution_strategy,
       num_gpus=FLAGS.num_gpus,
       tpu_address=FLAGS.tpu)
official/nlp/bert/run_pretraining.py

@@ -23,6 +23,7 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models

@@ -30,7 +31,6 @@ from official.nlp.bert import common_flags
 from official.nlp.bert import configs
 from official.nlp.bert import input_pipeline
 from official.nlp.bert import model_training_utils
-from official.utils.misc import distribution_utils

 flags.DEFINE_string('input_files', None,

@@ -205,9 +205,8 @@ def main(_):
     FLAGS.model_dir = '/tmp/bert20/'
   # Configures cluster spec for multi-worker distribution strategy.
   if FLAGS.num_gpus > 0:
-    _ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
-                                             FLAGS.task_index)
-  strategy = distribution_utils.get_distribution_strategy(
+    _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+  strategy = distribute_utils.get_distribution_strategy(
       distribution_strategy=FLAGS.distribution_strategy,
       num_gpus=FLAGS.num_gpus,
       all_reduce_alg=FLAGS.all_reduce_alg,
official/nlp/bert/run_squad.py

@@ -28,12 +28,11 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.nlp.bert import configs as bert_configs
 from official.nlp.bert import run_squad_helper
 from official.nlp.bert import tokenization
 from official.nlp.data import squad_lib as squad_lib_wp
-from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils

@@ -105,9 +104,8 @@ def main(_):
   # Configures cluster spec for multi-worker distribution strategy.
   if FLAGS.num_gpus > 0:
-    _ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
-                                             FLAGS.task_index)
-  strategy = distribution_utils.get_distribution_strategy(
+    _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+  strategy = distribute_utils.get_distribution_strategy(
       distribution_strategy=FLAGS.distribution_strategy,
       num_gpus=FLAGS.num_gpus,
       all_reduce_alg=FLAGS.all_reduce_alg,
official/nlp/bert/tf2_encoder_checkpoint_converter.py

@@ -15,7 +15,8 @@
 """A converter from a V1 BERT encoder checkpoint to a V2 encoder checkpoint.

 The conversion will yield an object-oriented checkpoint that can be used
-to restore a TransformerEncoder object.
+to restore a BertEncoder or BertPretrainerV2 object (see the `converted_model`
+FLAG below).
 """

 from __future__ import absolute_import
 from __future__ import division

@@ -27,9 +28,10 @@ from absl import app
 from absl import flags
 import tensorflow as tf
-from official.modeling import activations
+from official.modeling import tf_utils
 from official.nlp.bert import configs
 from official.nlp.bert import tf1_checkpoint_converter_lib
+from official.nlp.modeling import models
 from official.nlp.modeling import networks

 FLAGS = flags.FLAGS

@@ -46,6 +48,10 @@ flags.DEFINE_string("checkpoint_model_name", "encoder",
                     "The name of the model when saving the checkpoint, i.e., "
                     "the checkpoint will be saved using: "
                     "tf.train.Checkpoint(FLAGS.checkpoint_model_name=model).")

+flags.DEFINE_enum("converted_model", "encoder", ["encoder", "pretrainer"],
+                  "Whether to convert the checkpoint to a `BertEncoder` model or a "
+                  "`BertPretrainerV2` model (with mlm but without classification heads).")


 def _create_bert_model(cfg):

@@ -55,7 +61,7 @@ def _create_bert_model(cfg):
     cfg: A `BertConfig` to create the core model.

   Returns:
-    A TransformerEncoder network.
+    A BertEncoder network.
   """
   bert_encoder = networks.BertEncoder(
       vocab_size=cfg.vocab_size,

@@ -63,7 +69,7 @@ def _create_bert_model(cfg):
       num_layers=cfg.num_hidden_layers,
       num_attention_heads=cfg.num_attention_heads,
       intermediate_size=cfg.intermediate_size,
-      activation=activations.gelu,
+      activation=tf_utils.get_activation(cfg.hidden_act),
       dropout_rate=cfg.hidden_dropout_prob,
       attention_dropout_rate=cfg.attention_probs_dropout_prob,
       max_sequence_length=cfg.max_position_embeddings,

@@ -75,8 +81,29 @@ def _create_bert_model(cfg):
   return bert_encoder


-def convert_checkpoint(bert_config, output_path, v1_checkpoint,
-                       checkpoint_model_name="model"):
+def _create_bert_pretrainer_model(cfg):
+  """Creates a BERT keras core model from BERT configuration.
+
+  Args:
+    cfg: A `BertConfig` to create the core model.
+
+  Returns:
+    A BertPretrainerV2 model.
+  """
+  bert_encoder = _create_bert_model(cfg)
+  pretrainer = models.BertPretrainerV2(
+      encoder_network=bert_encoder,
+      mlm_activation=tf_utils.get_activation(cfg.hidden_act),
+      mlm_initializer=tf.keras.initializers.TruncatedNormal(
+          stddev=cfg.initializer_range))
+  return pretrainer
+
+
+def convert_checkpoint(bert_config, output_path, v1_checkpoint,
+                       checkpoint_model_name="model",
+                       converted_model="encoder"):
   """Converts a V1 checkpoint into an OO V2 checkpoint."""
   output_dir, _ = os.path.split(output_path)
   tf.io.gfile.makedirs(output_dir)

@@ -84,6 +111,7 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
   # Create a temporary V1 name-converted checkpoint in the output directory.
   temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1")
   temporary_checkpoint = os.path.join(temporary_checkpoint_dir, "ckpt")
   tf1_checkpoint_converter_lib.convert(
       checkpoint_from_path=v1_checkpoint,
       checkpoint_to_path=temporary_checkpoint,

@@ -92,8 +120,14 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
       permutations=tf1_checkpoint_converter_lib.BERT_V2_PERMUTATIONS,
       exclude_patterns=["adam", "Adam"])

+  if converted_model == "encoder":
+    model = _create_bert_model(bert_config)
+  elif converted_model == "pretrainer":
+    model = _create_bert_pretrainer_model(bert_config)
+  else:
+    raise ValueError("Unsupported converted_model: %s" % converted_model)
+
   # Create a V2 checkpoint from the temporary checkpoint.
-  model = _create_bert_model(bert_config)
   tf1_checkpoint_converter_lib.create_v2_checkpoint(model, temporary_checkpoint,
                                                     output_path,
                                                     checkpoint_model_name)

@@ -106,13 +140,21 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
     pass


-def main(_):
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
+
   output_path = FLAGS.converted_checkpoint_path
   v1_checkpoint = FLAGS.checkpoint_to_convert
   checkpoint_model_name = FLAGS.checkpoint_model_name
+  converted_model = FLAGS.converted_model
   bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
-  convert_checkpoint(bert_config, output_path, v1_checkpoint,
-                     checkpoint_model_name)
+  convert_checkpoint(
+      bert_config=bert_config,
+      output_path=output_path,
+      v1_checkpoint=v1_checkpoint,
+      checkpoint_model_name=checkpoint_model_name,
+      converted_model=converted_model)


 if __name__ == "__main__":
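For illustration only (the paths below are hypothetical placeholders, not from the commit), the new converted_model argument added above can be exercised directly through convert_checkpoint as in this sketch:

from official.nlp.bert import configs
from official.nlp.bert import tf2_encoder_checkpoint_converter as converter

# Hypothetical paths; substitute a real V1 BERT config and checkpoint.
bert_config = configs.BertConfig.from_json_file('/path/to/bert_config.json')
converter.convert_checkpoint(
    bert_config=bert_config,
    output_path='/path/to/converted/ckpt',
    v1_checkpoint='/path/to/v1/bert_model.ckpt',
    checkpoint_model_name='encoder',
    converted_model='pretrainer')  # or 'encoder' (the default)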
official/nlp/bert/tokenization.py

@@ -421,7 +421,7 @@ def preprocess_text(inputs, remove_space=True, lower=False):
   """Preprocesses data by removing extra space and normalize data.

   This method is used together with sentence piece tokenizer and is forked from:
-  https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py

   Args:
     inputs: The input text.

@@ -454,7 +454,7 @@ def encode_pieces(sp_model, text, sample=False):
   """Segements text into pieces.

   This method is used together with sentence piece tokenizer and is forked from:
-  https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py

   Args:

@@ -496,7 +496,7 @@ def encode_ids(sp_model, text, sample=False):
   """Segments text and return token ids.

   This method is used together with sentence piece tokenizer and is forked from:
-  https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+  https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py

   Args:
     sp_model: A spm.SentencePieceProcessor object.
official/nlp/configs/encoders.py

@@ -26,8 +26,9 @@ import tensorflow as tf
 from official.modeling import hyperparams
 from official.modeling import tf_utils
-from official.nlp.modeling import layers
+from official.nlp import keras_nlp
 from official.nlp.modeling import networks
+from official.nlp.projects.bigbird import encoder as bigbird_encoder


 @dataclasses.dataclass

@@ -60,18 +61,18 @@ class MobileBertEncoderConfig(hyperparams.Config):
     num_blocks: number of transformer block in the encoder model.
     hidden_size: the hidden size for the transformer block.
     num_attention_heads: number of attention heads in the transformer block.
     intermediate_size: the size of the "intermediate" (a.k.a., feed forward)
       layer.
     intermediate_act_fn: the non-linear activation function to apply to the
       output of the intermediate/feed-forward layer.
     hidden_dropout_prob: dropout probability for the hidden layers.
     attention_probs_dropout_prob: dropout probability of the attention
       probabilities.
     intra_bottleneck_size: the size of bottleneck.
     initializer_range: The stddev of the truncated_normal_initializer for
       initializing all weight matrices.
     key_query_shared_bottleneck: whether to share linear transformation for
       keys and queries.
     num_feedforward_networks: number of stacked feed-forward networks.
     normalization_type: the type of normalization_type, only 'no_norm' and
       'layer_norm' are supported. 'no_norm' represents the element-wise linear

@@ -79,8 +80,6 @@ class MobileBertEncoderConfig(hyperparams.Config):
       MobileBERT paper. 'layer_norm' is used for the teacher model.
     classifier_activation: if using the tanh activation for the final
       representation of the [CLS] token in fine-tuning.
-    return_all_layers: if return all layer outputs.
-    return_attention_score: if return attention scores for each layer.
   """
   word_vocab_size: int = 30522
   word_embed_size: int = 128

@@ -99,8 +98,6 @@ class MobileBertEncoderConfig(hyperparams.Config):
   num_feedforward_networks: int = 1
   normalization_type: str = "layer_norm"
   classifier_activation: bool = True
-  return_all_layers: bool = False
-  return_attention_score: bool = False


 @dataclasses.dataclass

@@ -120,27 +117,49 @@ class AlbertEncoderConfig(hyperparams.Config):
   initializer_range: float = 0.02


+@dataclasses.dataclass
+class BigBirdEncoderConfig(hyperparams.Config):
+  """BigBird encoder configuration."""
+  vocab_size: int = 50358
+  hidden_size: int = 768
+  num_layers: int = 12
+  num_attention_heads: int = 12
+  hidden_activation: str = "gelu"
+  intermediate_size: int = 3072
+  dropout_rate: float = 0.1
+  attention_dropout_rate: float = 0.1
+  max_position_embeddings: int = 4096
+  num_rand_blocks: int = 3
+  block_size: int = 64
+  type_vocab_size: int = 16
+  initializer_range: float = 0.02
+  embedding_size: Optional[int] = None
+
+
 @dataclasses.dataclass
 class EncoderConfig(hyperparams.OneOfConfig):
   """Encoder configuration."""
   type: Optional[str] = "bert"
   albert: AlbertEncoderConfig = AlbertEncoderConfig()
   bert: BertEncoderConfig = BertEncoderConfig()
+  bigbird: BigBirdEncoderConfig = BigBirdEncoderConfig()
   mobilebert: MobileBertEncoderConfig = MobileBertEncoderConfig()


 ENCODER_CLS = {
     "bert": networks.BertEncoder,
     "mobilebert": networks.MobileBERTEncoder,
-    "albert": networks.AlbertTransformerEncoder,
+    "albert": networks.AlbertEncoder,
+    "bigbird": bigbird_encoder.BigBirdEncoder,
 }


 @gin.configurable
-def build_encoder(config: EncoderConfig,
-                  embedding_layer: Optional[layers.OnDeviceEmbedding] = None,
-                  encoder_cls=None,
-                  bypass_config: bool = False):
+def build_encoder(config: EncoderConfig,
+                  embedding_layer: Optional[
+                      keras_nlp.layers.OnDeviceEmbedding] = None,
+                  encoder_cls=None,
+                  bypass_config: bool = False):
   """Instantiate a Transformer encoder network from EncoderConfig.

   Args:

@@ -188,7 +207,8 @@ def build_encoder(config: EncoderConfig,
         pooled_output_dim=encoder_cfg.hidden_size,
         pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
             stddev=encoder_cfg.initializer_range),
-        return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs)
+        return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs,
+        dict_outputs=True)
     return encoder_cls(**kwargs)

   if encoder_type == "mobilebert":

@@ -205,12 +225,11 @@ def build_encoder(config: EncoderConfig,
         hidden_dropout_prob=encoder_cfg.hidden_dropout_prob,
         attention_probs_dropout_prob=encoder_cfg.attention_probs_dropout_prob,
         intra_bottleneck_size=encoder_cfg.intra_bottleneck_size,
         initializer_range=encoder_cfg.initializer_range,
         key_query_shared_bottleneck=encoder_cfg.key_query_shared_bottleneck,
         num_feedforward_networks=encoder_cfg.num_feedforward_networks,
         normalization_type=encoder_cfg.normalization_type,
-        classifier_activation=encoder_cfg.classifier_activation,
-        return_all_layers=encoder_cfg.return_all_layers,
-        return_attention_score=encoder_cfg.return_attention_score)
+        classifier_activation=encoder_cfg.classifier_activation)

   if encoder_type == "albert":
     return encoder_cls(

@@ -226,7 +245,26 @@ def build_encoder(config: EncoderConfig,
         dropout_rate=encoder_cfg.dropout_rate,
         attention_dropout_rate=encoder_cfg.attention_dropout_rate,
         initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=encoder_cfg.initializer_range))
+            stddev=encoder_cfg.initializer_range),
+        dict_outputs=True)
+
+  if encoder_type == "bigbird":
+    return encoder_cls(
+        vocab_size=encoder_cfg.vocab_size,
+        hidden_size=encoder_cfg.hidden_size,
+        num_layers=encoder_cfg.num_layers,
+        num_attention_heads=encoder_cfg.num_attention_heads,
+        intermediate_size=encoder_cfg.intermediate_size,
+        activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
+        dropout_rate=encoder_cfg.dropout_rate,
+        attention_dropout_rate=encoder_cfg.attention_dropout_rate,
+        num_rand_blocks=encoder_cfg.num_rand_blocks,
+        block_size=encoder_cfg.block_size,
+        max_sequence_length=encoder_cfg.max_position_embeddings,
+        type_vocab_size=encoder_cfg.type_vocab_size,
+        initializer=tf.keras.initializers.TruncatedNormal(
+            stddev=encoder_cfg.initializer_range),
+        embedding_width=encoder_cfg.embedding_size)

   # Uses the default BERTEncoder configuration schema to create the encoder.
   # If it does not match, please add a switch branch by the encoder type.

@@ -245,4 +283,5 @@ def build_encoder(config: EncoderConfig,
           stddev=encoder_cfg.initializer_range),
       embedding_width=encoder_cfg.embedding_size,
       embedding_layer=embedding_layer,
-      return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs)
+      return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
+      dict_outputs=True)
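A minimal sketch (an assumption, not part of the commit) of how the new BigBird branch added above would be reached through EncoderConfig and build_encoder, assuming the TF Model Garden package is importable:

from official.nlp.configs import encoders

# Select the new 'bigbird' member of the OneOfConfig and override a few fields.
config = encoders.EncoderConfig(
    type='bigbird',
    bigbird=encoders.BigBirdEncoderConfig(num_layers=6, block_size=64))

# Dispatches to bigbird_encoder.BigBirdEncoder via ENCODER_CLS.
encoder = encoders.build_encoder(config)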
official/nlp/keras_nlp/README.md (new file, 0 → 100644)

# keras-nlp

## Layers

Layers are the fundamental building blocks for NLP models. They can be used to
assemble new layers, networks, or models.

*   [TransformerEncoderBlock](layers/transformer_encoder_block.py) implements
    an optionally masked transformer as described in
    ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).

*   [OnDeviceEmbedding](layers/on_device_embedding.py) implements efficient
    embedding lookups designed for TPU-based models.

*   [PositionalEmbedding](layers/position_embedding.py) creates a positional
    embedding as described in ["BERT: Pre-training of Deep Bidirectional
    Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805).

*   [SelfAttentionMask](layers/self_attention_mask.py) creates a 3D attention
    mask from a 2D tensor mask.

*   [MaskedLM](layers/masked_lm.py) implements a masked language model. It
    assumes the embedding table variable is passed to it.

## Encoders

Encoders are combinations of layers (and possibly other encoders). They are
sub-units of models that would not be trained alone. They encapsulate common
network structures like a classification head or a transformer encoder into an
easily handled object with a standardized configuration.

*   [BertEncoder](encoders/bert_encoder.py) implements a bi-directional
    Transformer-based encoder as described in
    ["BERT: Pre-training of Deep Bidirectional Transformers for Language
    Understanding"](https://arxiv.org/abs/1810.04805). It includes the
    embedding lookups, transformer layers and pooling layer.
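For illustration (this sketch is not part of the README), here is how these pieces are intended to compose, assuming the package layout introduced in this commit:

import numpy as np
from official.nlp import keras_nlp

# Build a small BERT-style encoder from the keras_nlp package.
encoder = keras_nlp.encoders.BertEncoder(
    vocab_size=30522, hidden_size=256, num_layers=4, num_attention_heads=4)

# The MaskedLM head reuses the encoder's word embedding table.
lm_head = keras_nlp.layers.MaskedLM(
    embedding_table=encoder.get_embedding_table())

word_ids = np.zeros((2, 16), dtype='int32')
mask = np.ones((2, 16), dtype='int32')
type_ids = np.zeros((2, 16), dtype='int32')
outputs = encoder([word_ids, mask, type_ids])
# outputs is a dict with 'sequence_output', 'pooled_output' and
# 'encoder_outputs'.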
official/nlp/keras_nlp/__init__.py

@@ -14,4 +14,5 @@
 # ==============================================================================
 """Keras-NLP package definition."""
 # pylint: disable=wildcard-import
-from official.nlp.keras_nlp.layers import *
+from official.nlp.keras_nlp import encoders
+from official.nlp.keras_nlp import layers
official/nlp/keras_nlp/contributing.md (new file, 0 → 100644)

## Contributing to KerasNLP

Patches to KerasNLP are welcome!

The source-of-truth repository lives under
[TF Model Garden NLP](https://github.com/tensorflow/models/tree/master/official/nlp/keras_nlp),
and is mirrored as a read-only repository under
[keras-team/keras-nlp](https://github.com/keras-team/keras-nlp).

Contributions should be made as PRs to the TF Model Garden repository.
This is to ensure the codebase is rigorously tested with state-of-the-art models
on different accelerators. In the long run, we will move development to the
`keras-team/keras-nlp` repository.

## :heavy_check_mark: Contributor checklist

1.  Ensure you have signed the
    [Contributor License Agreement](https://cla.developers.google.com/about/google-individual?csw=1).
    *   All code contributors are required to sign a Contributor License Agreement.
    *   Please read this
        [troubleshooting guide](Contributor-License-Agreements#troubleshooting-clas)
        if you encounter an issue.
2.  Please review the
    [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
3.  Check if your changes are consistent with the
    [TensorFlow coding style](https://www.tensorflow.org/community/contribute/code_style).
official/nlp/keras_nlp/encoders/__init__.py (new file, 0 → 100644)

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-NLP layers package definition."""
from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder
official/nlp/keras_nlp/encoders/bert_encoder.py (new file, 0 → 100644)

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bert encoder network."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf

from official.nlp.keras_nlp import layers


@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class BertEncoder(tf.keras.Model):
  """Bi-directional Transformer-based encoder network.

  This network implements a bi-directional Transformer-based encoder as
  described in "BERT: Pre-training of Deep Bidirectional Transformers for
  Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
  embedding lookups and transformer layers, but not the masked language model
  or classification task networks.

  The default values for this object are taken from the BERT-Base implementation
  in "BERT: Pre-training of Deep Bidirectional Transformers for Language
  Understanding".

  *Note* that the network is constructed by
  [Keras Functional API](https://keras.io/guides/functional_api/).

  Arguments:
    vocab_size: The size of the token vocabulary.
    hidden_size: The size of the transformer hidden layers.
    num_layers: The number of transformer layers.
    num_attention_heads: The number of attention heads for each transformer. The
      hidden size must be divisible by the number of attention heads.
    max_sequence_length: The maximum sequence length that this encoder can
      consume. If None, max_sequence_length uses the value from sequence length.
      This determines the variable shape for positional embeddings.
    type_vocab_size: The number of types that the 'type_ids' input can take.
    inner_dim: The output dimension of the first Dense layer in a two-layer
      feedforward network for each transformer.
    inner_activation: The activation for the first Dense layer in a two-layer
      feedforward network for each transformer.
    output_dropout: Dropout probability for the post-attention and output
      dropout.
    attention_dropout: The dropout rate to use for the attention layers
      within the transformer layers.
    initializer: The initializer to use for all weights in this encoder.
    output_range: The sequence output range, [0, output_range), by slicing the
      target sequence of the last transformer layer. `None` means the entire
      target sequence will attend to the source sequence, which yields the full
      output.
    embedding_width: The width of the word embeddings. If the embedding width is
      not equal to hidden size, embedding parameters will be factorized into two
      matrices in the shape of ['vocab_size', 'embedding_width'] and
      ['embedding_width', 'hidden_size'] ('embedding_width' is usually much
      smaller than 'hidden_size').
  """

  def __init__(
      self,
      vocab_size,
      hidden_size=768,
      num_layers=12,
      num_attention_heads=12,
      max_sequence_length=512,
      type_vocab_size=16,
      inner_dim=3072,
      inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
      output_dropout=0.1,
      attention_dropout=0.1,
      initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
      output_range=None,
      embedding_width=None,
      **kwargs):
    activation = tf.keras.activations.get(inner_activation)
    initializer = tf.keras.initializers.get(initializer)

    self._self_setattr_tracking = False
    self._config_dict = {
        'vocab_size': vocab_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'num_attention_heads': num_attention_heads,
        'max_sequence_length': max_sequence_length,
        'type_vocab_size': type_vocab_size,
        'inner_dim': inner_dim,
        'inner_activation': tf.keras.activations.serialize(activation),
        'output_dropout': output_dropout,
        'attention_dropout': attention_dropout,
        'initializer': tf.keras.initializers.serialize(initializer),
        'output_range': output_range,
        'embedding_width': embedding_width,
    }

    word_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_word_ids')
    mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_mask')
    type_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_type_ids')

    if embedding_width is None:
      embedding_width = hidden_size

    self._embedding_layer = self._build_embedding_layer()
    word_embeddings = self._embedding_layer(word_ids)

    # Always uses dynamic slicing for simplicity.
    self._position_embedding_layer = layers.PositionEmbedding(
        initializer=initializer,
        max_length=max_sequence_length,
        name='position_embedding')
    position_embeddings = self._position_embedding_layer(word_embeddings)
    self._type_embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=type_vocab_size,
        embedding_width=embedding_width,
        initializer=initializer,
        use_one_hot=True,
        name='type_embeddings')
    type_embeddings = self._type_embedding_layer(type_ids)

    embeddings = tf.keras.layers.Add()(
        [word_embeddings, position_embeddings, type_embeddings])

    self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
    embeddings = self._embedding_norm_layer(embeddings)

    embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

    # We project the 'embedding' output to 'hidden_size' if it is not already
    # 'hidden_size'.
    if embedding_width != hidden_size:
      self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
          '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
          kernel_initializer=initializer,
          name='embedding_projection')
      embeddings = self._embedding_projection(embeddings)

    self._transformer_layers = []
    data = embeddings
    attention_mask = layers.SelfAttentionMask()(data, mask)
    encoder_outputs = []
    for i in range(num_layers):
      if i == num_layers - 1 and output_range is not None:
        transformer_output_range = output_range
      else:
        transformer_output_range = None
      layer = layers.TransformerEncoderBlock(
          num_attention_heads=num_attention_heads,
          inner_dim=inner_dim,
          inner_activation=inner_activation,
          output_dropout=output_dropout,
          attention_dropout=attention_dropout,
          output_range=transformer_output_range,
          kernel_initializer=initializer,
          name='transformer/layer_%d' % i)
      self._transformer_layers.append(layer)
      data = layer([data, attention_mask])
      encoder_outputs.append(data)

    first_token_tensor = (
        tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
            encoder_outputs[-1]))
    self._pooler_layer = tf.keras.layers.Dense(
        units=hidden_size,
        activation='tanh',
        kernel_initializer=initializer,
        name='pooler_transform')
    cls_output = self._pooler_layer(first_token_tensor)

    outputs = dict(
        sequence_output=encoder_outputs[-1],
        pooled_output=cls_output,
        encoder_outputs=encoder_outputs,
    )

    super(BertEncoder, self).__init__(
        inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)

  def get_embedding_table(self):
    return self._embedding_layer.embeddings

  def _build_embedding_layer(self):
    embedding_width = self._config_dict[
        'embedding_width'] or self._config_dict['hidden_size']
    return layers.OnDeviceEmbedding(
        vocab_size=self._config_dict['vocab_size'],
        embedding_width=embedding_width,
        initializer=self._config_dict['initializer'],
        name='word_embeddings')

  def get_embedding_layer(self):
    return self._embedding_layer

  def get_config(self):
    return self._config_dict

  @property
  def transformer_layers(self):
    """List of Transformer layers in the encoder."""
    return self._transformer_layers

  @property
  def pooler_layer(self):
    """The pooler dense layer after the transformer layers."""
    return self._pooler_layer

  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
official/nlp/keras_nlp/encoders/bert_encoder_test.py (new file, 0 → 100644)

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for transformer-based bert encoder network."""

from absl.testing import parameterized
import numpy as np
import tensorflow as tf

from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.encoders import bert_encoder


# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BertEncoderTest(keras_parameterized.TestCase):

  def tearDown(self):
    super(BertEncoderTest, self).tearDown()
    tf.keras.mixed_precision.experimental.set_policy("float32")

  def test_network_creation(self):
    hidden_size = 32
    sequence_length = 21
    # Create a small BertEncoder for testing.
    test_network = bert_encoder.BertEncoder(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    self.assertIsInstance(test_network.transformer_layers, list)
    self.assertLen(test_network.transformer_layers, 3)
    self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)

    expected_data_shape = [None, sequence_length, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # The default output dtype is float32.
    self.assertAllEqual(tf.float32, data.dtype)
    self.assertAllEqual(tf.float32, pooled.dtype)

  def test_all_encoder_outputs_network_creation(self):
    hidden_size = 32
    sequence_length = 21
    # Create a small BertEncoder for testing.
    test_network = bert_encoder.BertEncoder(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    all_encoder_outputs = dict_outputs["encoder_outputs"]
    pooled = dict_outputs["pooled_output"]

    expected_data_shape = [None, sequence_length, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertLen(all_encoder_outputs, 3)
    for data in all_encoder_outputs:
      self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # The default output dtype is float32.
    self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
    self.assertAllEqual(tf.float32, pooled.dtype)

  def test_network_creation_with_float16_dtype(self):
    hidden_size = 32
    sequence_length = 21
    tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
    # Create a small BertEncoder for testing.
    test_network = bert_encoder.BertEncoder(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    expected_data_shape = [None, sequence_length, hidden_size]
    expected_pooled_shape = [None, hidden_size]
    self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # If float_dtype is set to float16, the data output is float32 (from a layer
    # norm) and pool output should be float16.
    self.assertAllEqual(tf.float32, data.dtype)
    self.assertAllEqual(tf.float16, pooled.dtype)

  @parameterized.named_parameters(
      ("all_sequence", None, 21),
      ("output_range", 1, 1),
  )
  def test_network_invocation(self, output_range, out_seq_len):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57
    num_types = 7
    # Create a small BertEncoder for testing.
    test_network = bert_encoder.BertEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        output_range=output_range)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    # Create a model based off of this network:
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])

    # Invoke the model. We can't validate the output data here (the model is too
    # complex) but this will catch structural runtime errors.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[1], out_seq_len)

    # Creates a BertEncoder with max_sequence_length != sequence_length
    max_sequence_length = 128
    test_network = bert_encoder.BertEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_sequence_length=max_sequence_length,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[1], sequence_length)

    # Creates a BertEncoder with embedding_width != hidden_size
    test_network = bert_encoder.BertEncoder(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_sequence_length=max_sequence_length,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        embedding_width=16)
    dict_outputs = test_network([word_ids, mask, type_ids])
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]
    model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
    outputs = model.predict([word_id_data, mask_data, type_id_data])
    self.assertEqual(outputs[0].shape[-1], hidden_size)
    self.assertTrue(hasattr(test_network, "_embedding_projection"))

  def test_serialize_deserialize(self):
    # Create a network object that sets all of its config options.
    kwargs = dict(
        vocab_size=100,
        hidden_size=32,
        num_layers=3,
        num_attention_heads=2,
        max_sequence_length=21,
        type_vocab_size=12,
        inner_dim=1223,
        inner_activation="relu",
        output_dropout=0.05,
        attention_dropout=0.22,
        initializer="glorot_uniform",
        output_range=-1,
        embedding_width=16)
    network = bert_encoder.BertEncoder(**kwargs)

    expected_config = dict(kwargs)
    expected_config["inner_activation"] = tf.keras.activations.serialize(
        tf.keras.activations.get(expected_config["inner_activation"]))
    expected_config["initializer"] = tf.keras.initializers.serialize(
        tf.keras.initializers.get(expected_config["initializer"]))
    self.assertEqual(network.get_config(), expected_config)

    # Create another network object from the first object's config.
    new_network = bert_encoder.BertEncoder.from_config(network.get_config())

    # Validate that the config can be forced to JSON.
    _ = network.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(network.get_config(), new_network.get_config())

    # Tests model saving/loading.
    model_path = self.get_temp_dir() + "/model"
    network.save(model_path)
    _ = tf.keras.models.load_model(model_path)


if __name__ == "__main__":
  tf.test.main()
official/nlp/keras_nlp/layers/__init__.py

@@ -13,6 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 """Keras-NLP layers package definition."""
+from official.nlp.keras_nlp.layers.masked_lm import MaskedLM
 from official.nlp.keras_nlp.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding
 from official.nlp.keras_nlp.layers.self_attention_mask import SelfAttentionMask
 from official.nlp.keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock
official/nlp/keras_nlp/layers/masked_lm.py (new file, 0 → 100644)

# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Masked language model network."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class MaskedLM(tf.keras.layers.Layer):
  """Masked language model network head for BERT modeling.

  This layer implements a masked language model based on the provided
  transformer based encoder. It assumes that the encoder network being passed
  has a "get_embedding_table()" method.

  Example:
  ```python
  encoder = keras_nlp.BertEncoder(...)
  lm_layer = MaskedLM(embedding_table=encoder.get_embedding_table())
  ```

  Arguments:
    embedding_table: The embedding table from encoder network.
    activation: The activation, if any, for the dense layer.
    initializer: The initializer for the dense layer. Defaults to a Glorot
      uniform initializer.
    output: The output style for this layer. Can be either 'logits' or
      'predictions'.
  """

  def __init__(self,
               embedding_table,
               activation=None,
               initializer='glorot_uniform',
               output='logits',
               name=None,
               **kwargs):
    super(MaskedLM, self).__init__(name=name, **kwargs)
    self.embedding_table = embedding_table
    self.activation = activation
    self.initializer = tf.keras.initializers.get(initializer)

    if output not in ('predictions', 'logits'):
      raise ValueError(
          ('Unknown `output` value "%s". `output` can be either "logits" or '
           '"predictions"') % output)
    self._output_type = output

  def build(self, input_shape):
    self._vocab_size, hidden_size = self.embedding_table.shape
    self.dense = tf.keras.layers.Dense(
        hidden_size,
        activation=self.activation,
        kernel_initializer=self.initializer,
        name='transform/dense')
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12, name='transform/LayerNorm')
    self.bias = self.add_weight(
        'output_bias/bias',
        shape=(self._vocab_size,),
        initializer='zeros',
        trainable=True)

    super(MaskedLM, self).build(input_shape)

  def call(self, sequence_data, masked_positions):
    masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
    lm_data = self.dense(masked_lm_input)
    lm_data = self.layer_norm(lm_data)
    lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
    logits = tf.nn.bias_add(lm_data, self.bias)
    masked_positions_length = masked_positions.shape[1] or tf.shape(
        masked_positions)[1]
    logits = tf.reshape(logits,
                        [-1, masked_positions_length, self._vocab_size])
    if self._output_type == 'logits':
      return logits
    return tf.nn.log_softmax(logits)

  def get_config(self):
    raise NotImplementedError('MaskedLM cannot be directly serialized because '
                              'it has variable sharing logic.')

  def _gather_indexes(self, sequence_tensor, positions):
    """Gathers the vectors at the specific positions.

    Args:
      sequence_tensor: Sequence output of `BertModel` layer of shape
        (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
        hidden units of `BertModel` layer.
      positions: Positions ids of tokens in sequence to mask for pretraining
        of with dimension (batch_size, num_predictions) where
        `num_predictions` is maximum number of tokens to mask out and predict
        per each sequence.

    Returns:
      Masked out sequence tensor of shape (batch_size * num_predictions,
      num_hidden).
    """
    sequence_shape = tf.shape(sequence_tensor)
    batch_size, seq_length = sequence_shape[0], sequence_shape[1]
    width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]

    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(
        sequence_tensor, [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)

    return output_tensor
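A minimal sketch (not part of the commit) of invoking the MaskedLM head added above on encoder outputs; the shapes and position values here are purely illustrative:

import numpy as np
import tensorflow as tf
from official.nlp.keras_nlp.encoders import bert_encoder
from official.nlp.keras_nlp.layers import masked_lm

encoder = bert_encoder.BertEncoder(
    vocab_size=100, hidden_size=32, num_layers=2, num_attention_heads=2)
lm_layer = masked_lm.MaskedLM(embedding_table=encoder.get_embedding_table())

word_ids = np.random.randint(100, size=(2, 10)).astype('int32')
mask = np.ones((2, 10), dtype='int32')
type_ids = np.zeros((2, 10), dtype='int32')
sequence_output = encoder([word_ids, mask, type_ids])['sequence_output']

# Predict logits over the vocabulary at two masked positions per example.
masked_positions = tf.constant([[1, 4], [2, 7]], dtype=tf.int32)
logits = lm_layer(sequence_output, masked_positions)  # shape (2, 2, vocab_size)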
official/nlp/keras_nlp/layers/on_device_embedding.py (new file, 0 → 100644)

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based one-hot embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf


@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class OnDeviceEmbedding(tf.keras.layers.Layer):
  """Performs an embedding lookup suitable for accelerator devices.

  This layer uses either tf.gather or tf.one_hot to translate integer indices to
  float embeddings.

  Arguments:
    vocab_size: Number of elements in the vocabulary.
    embedding_width: Output size of the embedding layer.
    initializer: The initializer to use for the embedding weights. Defaults to
      "glorot_uniform".
    use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
      lookup. Defaults to False (that is, using tf.gather). Setting this option
      to True may improve performance, especially on small vocabulary sizes, but
      will generally require more memory.
    scale_factor: Whether to scale the output embeddings. Defaults to None (that
      is, no scaling). Setting this option to a float will cause the output
      embeddings to be multiplied by scale_factor.
  """

  def __init__(self,
               vocab_size,
               embedding_width,
               initializer="glorot_uniform",
               use_one_hot=False,
               scale_factor=None,
               **kwargs):
    super(OnDeviceEmbedding, self).__init__(**kwargs)
    self._vocab_size = vocab_size
    self._embedding_width = embedding_width
    self._initializer = initializer
    self._use_one_hot = use_one_hot
    self._scale_factor = scale_factor

  def get_config(self):
    config = {
        "vocab_size": self._vocab_size,
        "embedding_width": self._embedding_width,
        "initializer": self._initializer,
        "use_one_hot": self._use_one_hot,
        "scale_factor": self._scale_factor,
    }
    base_config = super(OnDeviceEmbedding, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def build(self, input_shape):
    self.embeddings = self.add_weight(
        "embeddings",
        shape=[self._vocab_size, self._embedding_width],
        initializer=self._initializer,
        dtype=tf.float32)

    super(OnDeviceEmbedding, self).build(input_shape)

  def call(self, inputs):
    flat_inputs = tf.reshape(inputs, [-1])
    if self._use_one_hot:
      one_hot_data = tf.one_hot(
          flat_inputs, depth=self._vocab_size, dtype=self.embeddings.dtype)
      embeddings = tf.matmul(one_hot_data, self.embeddings)
    else:
      embeddings = tf.gather(self.embeddings, flat_inputs)
    embeddings = tf.reshape(
        embeddings,
        # Work around b/142213824: prefer concat to shape over a Python list.
        tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
    embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
    if self._scale_factor:
      embeddings *= self._scale_factor
    return embeddings
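A minimal sketch (not part of the commit) of the scale_factor option: scaling embeddings by sqrt(embedding_width), which is what the updated test in the next file exercises.

import numpy as np
from official.nlp.keras_nlp.layers import on_device_embedding

# Scale the looked-up embeddings by sqrt(embedding_width).
layer = on_device_embedding.OnDeviceEmbedding(
    vocab_size=100, embedding_width=64, scale_factor=64**0.5)
ids = np.random.randint(100, size=(2, 8)).astype('int32')
embeddings = layer(ids)  # shape (2, 8, 64); values multiplied by 8.0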
official/nlp/modeling/layers/on_device_embedding_test.py → official/nlp/keras_nlp/layers/on_device_embedding_test.py

@@ -18,7 +18,7 @@ import numpy as np
 import tensorflow as tf

 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
-from official.nlp.modeling.layers import on_device_embedding
+from official.nlp.keras_nlp.layers import on_device_embedding


 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It

@@ -192,7 +192,8 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
     vocab_size = 31
     embedding_width = 27
     test_layer = on_device_embedding.OnDeviceEmbedding(
-        vocab_size=vocab_size, embedding_width=embedding_width, use_scale=True)
+        vocab_size=vocab_size,
+        embedding_width=embedding_width,
+        scale_factor=embedding_width**0.5)
     # Create a 2-dimensional input (the first dimension is implicit).
     sequence_length = 23
     input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
official/nlp/keras_nlp/layers/position_embedding.py

@@ -17,7 +17,7 @@
 import tensorflow as tf


-@tf.keras.utils.register_keras_serializable(package="Text")
+@tf.keras.utils.register_keras_serializable(package="keras_nlp")
 class PositionEmbedding(tf.keras.layers.Layer):
   """Creates a positional embedding.
official/nlp/keras_nlp/layers/transformer_encoder_block.py

@@ -14,11 +14,10 @@
 # ==============================================================================
 """Keras-based TransformerEncoder block layer."""
 # Import libraries
 import tensorflow as tf


-@tf.keras.utils.register_keras_serializable(package="Text")
+@tf.keras.utils.register_keras_serializable(package="keras_nlp")
 class TransformerEncoderBlock(tf.keras.layers.Layer):
   """TransformerEncoderBlock layer.

@@ -241,6 +240,9 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
       input_tensor, attention_mask = (inputs, None)

     if self._output_range:
+      if self._norm_first:
+        source_tensor = input_tensor[:, 0:self._output_range, :]
+        input_tensor = self._attention_layer_norm(input_tensor)
       target_tensor = input_tensor[:, 0:self._output_range, :]
       attention_mask = attention_mask[:, 0:self._output_range, :]
     else:
official/nlp/keras_nlp/layers/transformer_encoder_block_test.py

@@ -14,11 +14,6 @@
 # ==============================================================================
 """Tests for Keras-based transformer block layer."""

-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# Import libraries
 from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf

@@ -142,6 +137,34 @@ class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase):
     self.assertAllClose(
         new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)

+  def test_layer_output_range_with_pre_norm(self, transformer_cls):
+    test_layer = transformer_cls(
+        num_attention_heads=10,
+        inner_dim=2048,
+        inner_activation='relu',
+        norm_first=True)
+    sequence_length = 21
+    width = 80
+
+    batch_size = 6
+    input_data = 10 * np.random.random_sample(
+        (batch_size, sequence_length, width))
+    mask_data = np.random.randint(
+        2, size=(batch_size, sequence_length, sequence_length))
+    output_tensor = test_layer([input_data, mask_data])
+
+    # The layer only attends to the first token and outputs the first token
+    # embedding.
+    new_layer = transformer_cls(
+        num_attention_heads=10,
+        inner_dim=2048,
+        inner_activation='relu',
+        output_range=1,
+        norm_first=True)
+    _ = new_layer([input_data, mask_data])
+    new_layer.set_weights(test_layer.get_weights())
+    new_output_tensor = new_layer([input_data, mask_data])
+    self.assertAllClose(
+        new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
+
   def test_layer_invocation_with_float16_dtype(self, transformer_cls):
     tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
     test_layer = transformer_cls(