Commit b0ccdb11 authored by Shixin Luo

resolve conflict with master

parents e61588cd 1611a8c5
@@ -25,8 +25,8 @@ import tempfile
 from absl import logging
 import tensorflow as tf
 from tensorflow.python.util import deprecation
+from official.common import distribute_utils
 from official.staging.training import grad_utils
-from official.utils.misc import distribution_utils
 _SUMMARY_TXT = 'training_summary.txt'
 _MIN_SUMMARY_STEPS = 10
@@ -164,9 +164,9 @@ def run_customized_training_loop(
 evaluation is skipped.
 eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
 is not none.
-metric_fn: A metrics function that returns a Keras Metric object to record
-evaluation result using evaluation dataset or with training dataset
-after every epoch.
+metric_fn: A metrics function that returns either a Keras Metric object or
+a list of Keras Metric objects to record evaluation result using
+evaluation dataset or with training dataset after every epoch.
 init_checkpoint: Optional checkpoint to load to `sub_model` returned by
 `model_fn`.
 custom_callbacks: A list of Keras Callbacks objects to run during
@@ -266,7 +266,7 @@ def run_customized_training_loop(
 train_iterator = _get_input_iterator(train_input_fn, strategy)
 eval_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-with distribution_utils.get_strategy_scope(strategy):
+with distribute_utils.get_strategy_scope(strategy):
 # To correctly place the model weights on accelerators,
 # model and optimizer should be created in scope.
 model, sub_model = model_fn()
@@ -291,7 +291,9 @@ def run_customized_training_loop(
 logging.info('Loading from checkpoint file completed')
 train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-eval_metrics = [metric_fn()] if metric_fn else []
+eval_metrics = metric_fn() if metric_fn else []
+if not isinstance(eval_metrics, list):
+  eval_metrics = [eval_metrics]
 # If evaluation is required, make a copy of metric as it will be used by
 # both train and evaluation.
 train_metrics = [
...
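The hunk above relaxes `metric_fn` so it may return either a single Keras metric or a list of metrics. A minimal sketch of both forms, using illustrative metric choices that are not part of this commit:

```python
import tensorflow as tf

def single_metric_fn():
  # A single metric object: wrapped into a one-element list by the loop code above.
  return tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)

def multi_metric_fn():
  # A list of metrics: passed through unchanged.
  return [
      tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32),
      tf.keras.metrics.Mean('masked_lm_loss', dtype=tf.float32),
  ]

# Mirrors the normalization now done in run_customized_training_loop.
for metric_fn in (single_metric_fn, multi_metric_fn):
  eval_metrics = metric_fn() if metric_fn else []
  if not isinstance(eval_metrics, list):
    eval_metrics = [eval_metrics]
  assert all(isinstance(m, tf.keras.metrics.Metric) for m in eval_metrics)
```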
@@ -28,6 +28,7 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
@@ -35,7 +36,6 @@ from official.nlp.bert import common_flags
 from official.nlp.bert import configs as bert_configs
 from official.nlp.bert import input_pipeline
 from official.nlp.bert import model_saving_utils
-from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
 flags.DEFINE_enum(
@@ -447,7 +447,7 @@ def custom_main(custom_callbacks=None, custom_metrics=None):
 FLAGS.model_dir)
 return
-strategy = distribution_utils.get_distribution_strategy(
+strategy = distribute_utils.get_distribution_strategy(
 distribution_strategy=FLAGS.distribution_strategy,
 num_gpus=FLAGS.num_gpus,
 tpu_address=FLAGS.tpu)
...
@@ -23,6 +23,7 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.modeling import performance
 from official.nlp import optimization
 from official.nlp.bert import bert_models
@@ -30,7 +31,6 @@ from official.nlp.bert import common_flags
 from official.nlp.bert import configs
 from official.nlp.bert import input_pipeline
 from official.nlp.bert import model_training_utils
-from official.utils.misc import distribution_utils
 flags.DEFINE_string('input_files', None,
@@ -205,9 +205,8 @@ def main(_):
 FLAGS.model_dir = '/tmp/bert20/'
 # Configures cluster spec for multi-worker distribution strategy.
 if FLAGS.num_gpus > 0:
-_ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
-                                         FLAGS.task_index)
-strategy = distribution_utils.get_distribution_strategy(
+_ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+strategy = distribute_utils.get_distribution_strategy(
 distribution_strategy=FLAGS.distribution_strategy,
 num_gpus=FLAGS.num_gpus,
 all_reduce_alg=FLAGS.all_reduce_alg,
...
@@ -28,12 +28,11 @@ from absl import flags
 from absl import logging
 import gin
 import tensorflow as tf
+from official.common import distribute_utils
 from official.nlp.bert import configs as bert_configs
 from official.nlp.bert import run_squad_helper
 from official.nlp.bert import tokenization
 from official.nlp.data import squad_lib as squad_lib_wp
-from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
@@ -105,9 +104,8 @@ def main(_):
 # Configures cluster spec for multi-worker distribution strategy.
 if FLAGS.num_gpus > 0:
-_ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
-                                         FLAGS.task_index)
-strategy = distribution_utils.get_distribution_strategy(
+_ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index)
+strategy = distribute_utils.get_distribution_strategy(
 distribution_strategy=FLAGS.distribution_strategy,
 num_gpus=FLAGS.num_gpus,
 all_reduce_alg=FLAGS.all_reduce_alg,
...
@@ -15,7 +15,8 @@
 """A converter from a V1 BERT encoder checkpoint to a V2 encoder checkpoint.
 The conversion will yield an object-oriented checkpoint that can be used
-to restore a TransformerEncoder object.
+to restore a BertEncoder or BertPretrainerV2 object (see the `converted_model`
+FLAG below).
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -27,9 +28,10 @@ from absl import app
 from absl import flags
 import tensorflow as tf
-from official.modeling import activations
+from official.modeling import tf_utils
 from official.nlp.bert import configs
 from official.nlp.bert import tf1_checkpoint_converter_lib
+from official.nlp.modeling import models
 from official.nlp.modeling import networks
 FLAGS = flags.FLAGS
@@ -46,6 +48,10 @@ flags.DEFINE_string("checkpoint_model_name", "encoder",
 "The name of the model when saving the checkpoint, i.e., "
 "the checkpoint will be saved using: "
 "tf.train.Checkpoint(FLAGS.checkpoint_model_name=model).")
+flags.DEFINE_enum(
+    "converted_model", "encoder", ["encoder", "pretrainer"],
+    "Whether to convert the checkpoint to a `BertEncoder` model or a "
+    "`BertPretrainerV2` model (with mlm but without classification heads).")
 def _create_bert_model(cfg):
@@ -55,7 +61,7 @@ def _create_bert_model(cfg):
 cfg: A `BertConfig` to create the core model.
 Returns:
-A TransformerEncoder netowork.
+A BertEncoder network.
 """
 bert_encoder = networks.BertEncoder(
 vocab_size=cfg.vocab_size,
@@ -63,7 +69,7 @@ def _create_bert_model(cfg):
 num_layers=cfg.num_hidden_layers,
 num_attention_heads=cfg.num_attention_heads,
 intermediate_size=cfg.intermediate_size,
-activation=activations.gelu,
+activation=tf_utils.get_activation(cfg.hidden_act),
 dropout_rate=cfg.hidden_dropout_prob,
 attention_dropout_rate=cfg.attention_probs_dropout_prob,
 max_sequence_length=cfg.max_position_embeddings,
@@ -75,8 +81,29 @@ def _create_bert_model(cfg):
 return bert_encoder
-def convert_checkpoint(bert_config, output_path, v1_checkpoint,
-                       checkpoint_model_name="model"):
+def _create_bert_pretrainer_model(cfg):
+  """Creates a BERT keras core model from BERT configuration.
+  Args:
+    cfg: A `BertConfig` to create the core model.
+  Returns:
+    A BertPretrainerV2 model.
+  """
+  bert_encoder = _create_bert_model(cfg)
+  pretrainer = models.BertPretrainerV2(
+      encoder_network=bert_encoder,
+      mlm_activation=tf_utils.get_activation(cfg.hidden_act),
+      mlm_initializer=tf.keras.initializers.TruncatedNormal(
+          stddev=cfg.initializer_range))
+  return pretrainer
+def convert_checkpoint(bert_config,
+                       output_path,
+                       v1_checkpoint,
+                       checkpoint_model_name="model",
+                       converted_model="encoder"):
 """Converts a V1 checkpoint into an OO V2 checkpoint."""
 output_dir, _ = os.path.split(output_path)
 tf.io.gfile.makedirs(output_dir)
@@ -84,6 +111,7 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
 # Create a temporary V1 name-converted checkpoint in the output directory.
 temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1")
 temporary_checkpoint = os.path.join(temporary_checkpoint_dir, "ckpt")
 tf1_checkpoint_converter_lib.convert(
 checkpoint_from_path=v1_checkpoint,
 checkpoint_to_path=temporary_checkpoint,
@@ -92,8 +120,14 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
 permutations=tf1_checkpoint_converter_lib.BERT_V2_PERMUTATIONS,
 exclude_patterns=["adam", "Adam"])
-# Create a V2 checkpoint from the temporary checkpoint.
-model = _create_bert_model(bert_config)
+if converted_model == "encoder":
+  model = _create_bert_model(bert_config)
+elif converted_model == "pretrainer":
+  model = _create_bert_pretrainer_model(bert_config)
+else:
+  raise ValueError("Unsupported converted_model: %s" % converted_model)
+# Create a V2 checkpoint from the temporary checkpoint.
 tf1_checkpoint_converter_lib.create_v2_checkpoint(model, temporary_checkpoint,
                                                   output_path,
                                                   checkpoint_model_name)
@@ -106,13 +140,21 @@ def convert_checkpoint(bert_config, output_path, v1_checkpoint,
 pass
-def main(_):
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError("Too many command-line arguments.")
 output_path = FLAGS.converted_checkpoint_path
 v1_checkpoint = FLAGS.checkpoint_to_convert
 checkpoint_model_name = FLAGS.checkpoint_model_name
+converted_model = FLAGS.converted_model
 bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
-convert_checkpoint(bert_config, output_path, v1_checkpoint,
-                   checkpoint_model_name)
+convert_checkpoint(
+    bert_config=bert_config,
+    output_path=output_path,
+    v1_checkpoint=v1_checkpoint,
+    checkpoint_model_name=checkpoint_model_name,
+    converted_model=converted_model)
 if __name__ == "__main__":
...
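A hedged sketch of driving the converter with the new `converted_model` option. The checkpoint and config paths below are placeholders, and the module path `official.nlp.bert.tf2_encoder_checkpoint_converter` is an assumption, not something stated in this commit:

```python
from official.nlp.bert import configs
from official.nlp.bert import tf2_encoder_checkpoint_converter as converter  # assumed path

bert_config = configs.BertConfig.from_json_file('/path/to/bert_config.json')
converter.convert_checkpoint(
    bert_config=bert_config,
    output_path='/path/to/converted/bert_v2.ckpt',
    v1_checkpoint='/path/to/v1/bert_model.ckpt',
    checkpoint_model_name='model',
    # 'pretrainer' wraps the encoder in BertPretrainerV2 (with the MLM head);
    # 'encoder' converts only the BertEncoder weights.
    converted_model='pretrainer')
```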
@@ -421,7 +421,7 @@ def preprocess_text(inputs, remove_space=True, lower=False):
 """Preprocesses data by removing extra space and normalize data.
 This method is used together with sentence piece tokenizer and is forked from:
-https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py
 Args:
 inputs: The input text.
@@ -454,7 +454,7 @@ def encode_pieces(sp_model, text, sample=False):
 """Segements text into pieces.
 This method is used together with sentence piece tokenizer and is forked from:
-https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py
 Args:
@@ -496,7 +496,7 @@ def encode_ids(sp_model, text, sample=False):
 """Segments text and return token ids.
 This method is used together with sentence piece tokenizer and is forked from:
-https://github.com/google-research/google-research/blob/master/albert/tokenization.py
+https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py
 Args:
 sp_model: A spm.SentencePieceProcessor object.
...
@@ -26,8 +26,9 @@ import tensorflow as tf
 from official.modeling import hyperparams
 from official.modeling import tf_utils
-from official.nlp.modeling import layers
+from official.nlp import keras_nlp
 from official.nlp.modeling import networks
+from official.nlp.projects.bigbird import encoder as bigbird_encoder
 @dataclasses.dataclass
@@ -60,18 +61,18 @@ class MobileBertEncoderConfig(hyperparams.Config):
 num_blocks: number of transformer block in the encoder model.
 hidden_size: the hidden size for the transformer block.
 num_attention_heads: number of attention heads in the transformer block.
-intermediate_size: the size of the "intermediate" (a.k.a., feed
-forward) layer.
-intermediate_act_fn: the non-linear activation function to apply
-to the output of the intermediate/feed-forward layer.
+intermediate_size: the size of the "intermediate" (a.k.a., feed forward)
+layer.
+intermediate_act_fn: the non-linear activation function to apply to the
+output of the intermediate/feed-forward layer.
 hidden_dropout_prob: dropout probability for the hidden layers.
 attention_probs_dropout_prob: dropout probability of the attention
 probabilities.
 intra_bottleneck_size: the size of bottleneck.
 initializer_range: The stddev of the truncated_normal_initializer for
 initializing all weight matrices.
-key_query_shared_bottleneck: whether to share linear transformation for
-keys and queries.
+key_query_shared_bottleneck: whether to share linear transformation for keys
+and queries.
 num_feedforward_networks: number of stacked feed-forward networks.
 normalization_type: the type of normalization_type, only 'no_norm' and
 'layer_norm' are supported. 'no_norm' represents the element-wise linear
@@ -79,8 +80,6 @@ class MobileBertEncoderConfig(hyperparams.Config):
 MobileBERT paper. 'layer_norm' is used for the teacher model.
 classifier_activation: if using the tanh activation for the final
 representation of the [CLS] token in fine-tuning.
-return_all_layers: if return all layer outputs.
-return_attention_score: if return attention scores for each layer.
 """
 word_vocab_size: int = 30522
 word_embed_size: int = 128
@@ -99,8 +98,6 @@ class MobileBertEncoderConfig(hyperparams.Config):
 num_feedforward_networks: int = 1
 normalization_type: str = "layer_norm"
 classifier_activation: bool = True
-return_all_layers: bool = False
-return_attention_score: bool = False
 @dataclasses.dataclass
@@ -120,25 +117,47 @@ class AlbertEncoderConfig(hyperparams.Config):
 initializer_range: float = 0.02
+@dataclasses.dataclass
+class BigBirdEncoderConfig(hyperparams.Config):
+  """BigBird encoder configuration."""
+  vocab_size: int = 50358
+  hidden_size: int = 768
+  num_layers: int = 12
+  num_attention_heads: int = 12
+  hidden_activation: str = "gelu"
+  intermediate_size: int = 3072
+  dropout_rate: float = 0.1
+  attention_dropout_rate: float = 0.1
+  max_position_embeddings: int = 4096
+  num_rand_blocks: int = 3
+  block_size: int = 64
+  type_vocab_size: int = 16
+  initializer_range: float = 0.02
+  embedding_size: Optional[int] = None
 @dataclasses.dataclass
 class EncoderConfig(hyperparams.OneOfConfig):
 """Encoder configuration."""
 type: Optional[str] = "bert"
 albert: AlbertEncoderConfig = AlbertEncoderConfig()
 bert: BertEncoderConfig = BertEncoderConfig()
+bigbird: BigBirdEncoderConfig = BigBirdEncoderConfig()
 mobilebert: MobileBertEncoderConfig = MobileBertEncoderConfig()
 ENCODER_CLS = {
 "bert": networks.BertEncoder,
 "mobilebert": networks.MobileBERTEncoder,
-"albert": networks.AlbertTransformerEncoder,
+"albert": networks.AlbertEncoder,
+"bigbird": bigbird_encoder.BigBirdEncoder,
 }
 @gin.configurable
-def build_encoder(config: EncoderConfig,
-                  embedding_layer: Optional[layers.OnDeviceEmbedding] = None,
+def build_encoder(
+    config: EncoderConfig,
+    embedding_layer: Optional[keras_nlp.layers.OnDeviceEmbedding] = None,
 encoder_cls=None,
 bypass_config: bool = False):
 """Instantiate a Transformer encoder network from EncoderConfig.
@@ -188,7 +207,8 @@ def build_encoder(config: EncoderConfig,
 pooled_output_dim=encoder_cfg.hidden_size,
 pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
 stddev=encoder_cfg.initializer_range),
-return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs)
+return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs,
+dict_outputs=True)
 return encoder_cls(**kwargs)
 if encoder_type == "mobilebert":
@@ -205,12 +225,11 @@ def build_encoder(config: EncoderConfig,
 hidden_dropout_prob=encoder_cfg.hidden_dropout_prob,
 attention_probs_dropout_prob=encoder_cfg.attention_probs_dropout_prob,
 intra_bottleneck_size=encoder_cfg.intra_bottleneck_size,
+initializer_range=encoder_cfg.initializer_range,
 key_query_shared_bottleneck=encoder_cfg.key_query_shared_bottleneck,
 num_feedforward_networks=encoder_cfg.num_feedforward_networks,
 normalization_type=encoder_cfg.normalization_type,
-classifier_activation=encoder_cfg.classifier_activation,
-return_all_layers=encoder_cfg.return_all_layers,
-return_attention_score=encoder_cfg.return_attention_score)
+classifier_activation=encoder_cfg.classifier_activation)
 if encoder_type == "albert":
 return encoder_cls(
@@ -226,7 +245,26 @@ def build_encoder(config: EncoderConfig,
 dropout_rate=encoder_cfg.dropout_rate,
 attention_dropout_rate=encoder_cfg.attention_dropout_rate,
 initializer=tf.keras.initializers.TruncatedNormal(
-stddev=encoder_cfg.initializer_range))
+stddev=encoder_cfg.initializer_range),
+dict_outputs=True)
+if encoder_type == "bigbird":
+  return encoder_cls(
+      vocab_size=encoder_cfg.vocab_size,
+      hidden_size=encoder_cfg.hidden_size,
+      num_layers=encoder_cfg.num_layers,
+      num_attention_heads=encoder_cfg.num_attention_heads,
+      intermediate_size=encoder_cfg.intermediate_size,
+      activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
+      dropout_rate=encoder_cfg.dropout_rate,
+      attention_dropout_rate=encoder_cfg.attention_dropout_rate,
+      num_rand_blocks=encoder_cfg.num_rand_blocks,
+      block_size=encoder_cfg.block_size,
+      max_sequence_length=encoder_cfg.max_position_embeddings,
+      type_vocab_size=encoder_cfg.type_vocab_size,
+      initializer=tf.keras.initializers.TruncatedNormal(
+          stddev=encoder_cfg.initializer_range),
+      embedding_width=encoder_cfg.embedding_size)
 # Uses the default BERTEncoder configuration schema to create the encoder.
 # If it does not match, please add a switch branch by the encoder type.
@@ -245,4 +283,5 @@ def build_encoder(config: EncoderConfig,
 stddev=encoder_cfg.initializer_range),
 embedding_width=encoder_cfg.embedding_size,
 embedding_layer=embedding_layer,
-return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs)
+return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs,
+dict_outputs=True)
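A minimal sketch of selecting the new BigBird branch through the `OneOfConfig`-based `EncoderConfig`. The module path (`official.nlp.configs.encoders`) and the hyperparameter values are assumptions for illustration, not values taken from this commit:

```python
from official.nlp.configs import encoders  # assumed module path for this config file

config = encoders.EncoderConfig(
    type='bigbird',
    bigbird=encoders.BigBirdEncoderConfig(
        vocab_size=50358, num_layers=2, num_attention_heads=4, block_size=64))
encoder = encoders.build_encoder(config)  # a bigbird_encoder.BigBirdEncoder instance
```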
# keras-nlp
## Layers
Layers are the fundamental building blocks for NLP models. They can be used to
assemble new layers, networks, or models; a short usage sketch follows the list
below.
* [TransformerEncoderBlock](layers/transformer_encoder_block.py) implements
an optionally masked transformer as described in
["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
* [OnDeviceEmbedding](layers/on_device_embedding.py) implements efficient
embedding lookups designed for TPU-based models.
* [PositionEmbedding](layers/position_embedding.py) creates a positional
embedding as described in ["BERT: Pre-training of Deep Bidirectional
Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805).
* [SelfAttentionMask](layers/self_attention_mask.py) creates a 3D attention
mask from a 2D tensor mask.
* [MaskedLM](layers/masked_lm.py) implements a masked language model. It
assumes the embedding table variable is passed to it.
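A minimal sketch of composing these layers into a tiny encoder block; the sizes are illustrative only:

```python
import tensorflow as tf
from official.nlp.keras_nlp import layers

word_ids = tf.keras.Input(shape=(None,), dtype=tf.int32)
mask = tf.keras.Input(shape=(None,), dtype=tf.int32)

# Token + position embeddings, followed by one transformer block.
embeddings = layers.OnDeviceEmbedding(vocab_size=100, embedding_width=32)(word_ids)
embeddings = embeddings + layers.PositionEmbedding(max_length=64)(embeddings)
attention_mask = layers.SelfAttentionMask()(embeddings, mask)
outputs = layers.TransformerEncoderBlock(
    num_attention_heads=2, inner_dim=128, inner_activation='relu')(
        [embeddings, attention_mask])

model = tf.keras.Model([word_ids, mask], outputs)
```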
## Encoders
Encoders are combinations of layers (and possibly other encoders). They are
sub-units of models that would not be trained alone. They encapsulate common
network structures, such as a classification head or a transformer encoder,
into easily handled objects with a standardized configuration. A usage sketch
follows the list below.
* [BertEncoder](encoders/bert_encoder.py) implements a bi-directional
Transformer-based encoder as described in
["BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding
lookups, transformer layers and pooling layer.
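A hedged usage sketch of `BertEncoder` with small test-sized hyperparameters (not the BERT-Base defaults):

```python
import tensorflow as tf
from official.nlp.keras_nlp.encoders import bert_encoder

encoder = bert_encoder.BertEncoder(
    vocab_size=100, hidden_size=32, num_attention_heads=2, num_layers=3)

word_ids = tf.keras.Input(shape=(16,), dtype=tf.int32)
mask = tf.keras.Input(shape=(16,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(16,), dtype=tf.int32)

outputs = encoder([word_ids, mask, type_ids])
sequence_output = outputs['sequence_output']  # [batch, seq_len, hidden_size]
pooled_output = outputs['pooled_output']      # [batch, hidden_size]
all_layers = outputs['encoder_outputs']       # list of per-layer activations
```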
@@ -14,4 +14,5 @@
 # ==============================================================================
 """Keras-NLP package definition."""
 # pylint: disable=wildcard-import
-from official.nlp.keras_nlp.layers import *
+from official.nlp.keras_nlp import encoders
+from official.nlp.keras_nlp import layers
## Contributing to KerasNLP
Patches to KerasNLP are welcome!
The source-of-truth repository lives under
[TF Model Garden NLP](https://github.com/tensorflow/models/tree/master/official/nlp/keras_nlp),
and is mirrored as a read-only repository under
[keras-team/keras-nlp](https://github.com/keras-team/keras-nlp).
Contributions should be made as PRs to the TF Model Garden repository.
This ensures the codebase is rigorously tested with state-of-the-art models
on different accelerators.
In the long run, we will move development to the `keras-team/keras-nlp` repository.
## :heavy_check_mark: Contributor checklist
1. Ensure you have signed the [Contributor License Agreement](https://cla.developers.google.com/about/google-individual?csw=1).
* All code contributors are required to sign a Contributor License Agreement.
* Please read this [troubleshooting guide](Contributor-License-Agreements#troubleshooting-clas)
if you encounter an issue.
2. Please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
3. Check if your changes are consistent with the [TensorFlow coding style](https://www.tensorflow.org/community/contribute/code_style).
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-NLP layers package definition."""
from official.nlp.keras_nlp.encoders.bert_encoder import BertEncoder
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bert encoder network."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
from official.nlp.keras_nlp import layers
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class BertEncoder(tf.keras.Model):
"""Bi-directional Transformer-based encoder network.
This network implements a bi-directional Transformer-based encoder as
described in "BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
embedding lookups and transformer layers, but not the masked language model
or classification task networks.
The default values for this object are taken from the BERT-Base implementation
in "BERT: Pre-training of Deep Bidirectional Transformers for Language
Understanding".
*Note* that the network is constructed by
[Keras Functional API](https://keras.io/guides/functional_api/).
Arguments:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
num_attention_heads: The number of attention heads for each transformer. The
hidden size must be divisible by the number of attention heads.
max_sequence_length: The maximum sequence length that this encoder can
consume. If None, max_sequence_length uses the value from sequence length.
This determines the variable shape for positional embeddings.
type_vocab_size: The number of types that the 'type_ids' input can take.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network for each transformer.
inner_activation: The activation for the first Dense layer in a two-layer
feedforward network for each transformer.
output_dropout: Dropout probability for the post-attention and output
dropout.
attention_dropout: The dropout rate to use for the attention layers
within the transformer layers.
initializer: The initializer to use for all weights in this encoder.
output_range: The sequence output range, [0, output_range), by slicing the
target sequence of the last transformer layer. `None` means the entire
target sequence will attend to the source sequence, which yields the full
output.
embedding_width: The width of the word embeddings. If the embedding width is
not equal to hidden size, embedding parameters will be factorized into two
matrices in the shape of ['vocab_size', 'embedding_width'] and
['embedding_width', 'hidden_size'] ('embedding_width' is usually much
smaller than 'hidden_size').
"""
def __init__(
self,
vocab_size,
hidden_size=768,
num_layers=12,
num_attention_heads=12,
max_sequence_length=512,
type_vocab_size=16,
inner_dim=3072,
inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
output_dropout=0.1,
attention_dropout=0.1,
initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
output_range=None,
embedding_width=None,
**kwargs):
activation = tf.keras.activations.get(inner_activation)
initializer = tf.keras.initializers.get(initializer)
self._self_setattr_tracking = False
self._config_dict = {
'vocab_size': vocab_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'num_attention_heads': num_attention_heads,
'max_sequence_length': max_sequence_length,
'type_vocab_size': type_vocab_size,
'inner_dim': inner_dim,
'inner_activation': tf.keras.activations.serialize(activation),
'output_dropout': output_dropout,
'attention_dropout': attention_dropout,
'initializer': tf.keras.initializers.serialize(initializer),
'output_range': output_range,
'embedding_width': embedding_width,
}
word_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_word_ids')
mask = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_mask')
type_ids = tf.keras.layers.Input(
shape=(None,), dtype=tf.int32, name='input_type_ids')
if embedding_width is None:
embedding_width = hidden_size
self._embedding_layer = self._build_embedding_layer()
word_embeddings = self._embedding_layer(word_ids)
# Always uses dynamic slicing for simplicity.
self._position_embedding_layer = layers.PositionEmbedding(
initializer=initializer,
max_length=max_sequence_length,
name='position_embedding')
position_embeddings = self._position_embedding_layer(word_embeddings)
self._type_embedding_layer = layers.OnDeviceEmbedding(
vocab_size=type_vocab_size,
embedding_width=embedding_width,
initializer=initializer,
use_one_hot=True,
name='type_embeddings')
type_embeddings = self._type_embedding_layer(type_ids)
embeddings = tf.keras.layers.Add()(
[word_embeddings, position_embeddings, type_embeddings])
self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)
embeddings = self._embedding_norm_layer(embeddings)
embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))
# We project the 'embedding' output to 'hidden_size' if it is not already
# 'hidden_size'.
if embedding_width != hidden_size:
self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
'...x,xy->...y',
output_shape=hidden_size,
bias_axes='y',
kernel_initializer=initializer,
name='embedding_projection')
embeddings = self._embedding_projection(embeddings)
self._transformer_layers = []
data = embeddings
attention_mask = layers.SelfAttentionMask()(data, mask)
encoder_outputs = []
for i in range(num_layers):
if i == num_layers - 1 and output_range is not None:
transformer_output_range = output_range
else:
transformer_output_range = None
layer = layers.TransformerEncoderBlock(
num_attention_heads=num_attention_heads,
inner_dim=inner_dim,
inner_activation=inner_activation,
output_dropout=output_dropout,
attention_dropout=attention_dropout,
output_range=transformer_output_range,
kernel_initializer=initializer,
name='transformer/layer_%d' % i)
self._transformer_layers.append(layer)
data = layer([data, attention_mask])
encoder_outputs.append(data)
first_token_tensor = (
tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
encoder_outputs[-1]))
self._pooler_layer = tf.keras.layers.Dense(
units=hidden_size,
activation='tanh',
kernel_initializer=initializer,
name='pooler_transform')
cls_output = self._pooler_layer(first_token_tensor)
outputs = dict(
sequence_output=encoder_outputs[-1],
pooled_output=cls_output,
encoder_outputs=encoder_outputs,
)
super(BertEncoder, self).__init__(
inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
def get_embedding_table(self):
return self._embedding_layer.embeddings
def _build_embedding_layer(self):
embedding_width = self._config_dict[
'embedding_width'] or self._config_dict['hidden_size']
return layers.OnDeviceEmbedding(
vocab_size=self._config_dict['vocab_size'],
embedding_width=embedding_width,
initializer=self._config_dict['initializer'],
name='word_embeddings')
def get_embedding_layer(self):
return self._embedding_layer
def get_config(self):
return self._config_dict
@property
def transformer_layers(self):
"""List of Transformer layers in the encoder."""
return self._transformer_layers
@property
def pooler_layer(self):
"""The pooler dense layer after the transformer layers."""
return self._pooler_layer
@classmethod
def from_config(cls, config, custom_objects=None):
return cls(**config)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for transformer-based bert encoder network."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.nlp.keras_nlp.encoders import bert_encoder
# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
# guarantees forward compatibility of this code for the V2 switchover.
@keras_parameterized.run_all_keras_modes
class BertEncoderTest(keras_parameterized.TestCase):
def tearDown(self):
super(BertEncoderTest, self).tearDown()
tf.keras.mixed_precision.experimental.set_policy("float32")
def test_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
self.assertIsInstance(test_network.transformer_layers, list)
self.assertLen(test_network.transformer_layers, 3)
self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_all_encoder_outputs_network_creation(self):
hidden_size = 32
sequence_length = 21
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
all_encoder_outputs = dict_outputs["encoder_outputs"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertLen(all_encoder_outputs, 3)
for data in all_encoder_outputs:
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# The default output dtype is float32.
self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
self.assertAllEqual(tf.float32, pooled.dtype)
def test_network_creation_with_float16_dtype(self):
hidden_size = 32
sequence_length = 21
tf.keras.mixed_precision.experimental.set_policy("mixed_float16")
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=100,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
expected_data_shape = [None, sequence_length, hidden_size]
expected_pooled_shape = [None, hidden_size]
self.assertAllEqual(expected_data_shape, data.shape.as_list())
self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
# If float_dtype is set to float16, the data output is float32 (from a layer
# norm) and pool output should be float16.
self.assertAllEqual(tf.float32, data.dtype)
self.assertAllEqual(tf.float16, pooled.dtype)
@parameterized.named_parameters(
("all_sequence", None, 21),
("output_range", 1, 1),
)
def test_network_invocation(self, output_range, out_seq_len):
hidden_size = 32
sequence_length = 21
vocab_size = 57
num_types = 7
# Create a small BertEncoder for testing.
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
output_range=output_range)
# Create the inputs (note that the first dimension is implicit).
word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
# Create a model based off of this network:
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
# Invoke the model. We can't validate the output data here (the model is too
# complex) but this will catch structural runtime errors.
batch_size = 3
word_id_data = np.random.randint(
vocab_size, size=(batch_size, sequence_length))
mask_data = np.random.randint(2, size=(batch_size, sequence_length))
type_id_data = np.random.randint(
num_types, size=(batch_size, sequence_length))
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], out_seq_len)
# Creates a BertEncoder with max_sequence_length != sequence_length
max_sequence_length = 128
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[1], sequence_length)
# Creates a BertEncoder with embedding_width != hidden_size
test_network = bert_encoder.BertEncoder(
vocab_size=vocab_size,
hidden_size=hidden_size,
max_sequence_length=max_sequence_length,
num_attention_heads=2,
num_layers=3,
type_vocab_size=num_types,
embedding_width=16)
dict_outputs = test_network([word_ids, mask, type_ids])
data = dict_outputs["sequence_output"]
pooled = dict_outputs["pooled_output"]
model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled])
outputs = model.predict([word_id_data, mask_data, type_id_data])
self.assertEqual(outputs[0].shape[-1], hidden_size)
self.assertTrue(hasattr(test_network, "_embedding_projection"))
def test_serialize_deserialize(self):
# Create a network object that sets all of its config options.
kwargs = dict(
vocab_size=100,
hidden_size=32,
num_layers=3,
num_attention_heads=2,
max_sequence_length=21,
type_vocab_size=12,
inner_dim=1223,
inner_activation="relu",
output_dropout=0.05,
attention_dropout=0.22,
initializer="glorot_uniform",
output_range=-1,
embedding_width=16)
network = bert_encoder.BertEncoder(**kwargs)
expected_config = dict(kwargs)
expected_config["inner_activation"] = tf.keras.activations.serialize(
tf.keras.activations.get(expected_config["inner_activation"]))
expected_config["initializer"] = tf.keras.initializers.serialize(
tf.keras.initializers.get(expected_config["initializer"]))
self.assertEqual(network.get_config(), expected_config)
# Create another network object from the first object's config.
new_network = bert_encoder.BertEncoder.from_config(network.get_config())
# Validate that the config can be forced to JSON.
_ = network.to_json()
# If the serialization was successful, the new config should match the old.
self.assertAllEqual(network.get_config(), new_network.get_config())
# Tests model saving/loading.
model_path = self.get_temp_dir() + "/model"
network.save(model_path)
_ = tf.keras.models.load_model(model_path)
if __name__ == "__main__":
tf.test.main()
@@ -13,6 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 """Keras-NLP layers package definition."""
+from official.nlp.keras_nlp.layers.masked_lm import MaskedLM
+from official.nlp.keras_nlp.layers.on_device_embedding import OnDeviceEmbedding
 from official.nlp.keras_nlp.layers.position_embedding import PositionEmbedding
 from official.nlp.keras_nlp.layers.self_attention_mask import SelfAttentionMask
 from official.nlp.keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Masked language model network."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='keras_nlp')
class MaskedLM(tf.keras.layers.Layer):
"""Masked language model network head for BERT modeling.
This layer implements a masked language model based on the provided
transformer-based encoder. It assumes that the encoder network being passed
has a "get_embedding_table()" method.
Example:
```python
encoder=keras_nlp.BertEncoder(...)
lm_layer=MaskedLM(embedding_table=encoder.get_embedding_table())
```
Arguments:
embedding_table: The embedding table from encoder network.
activation: The activation, if any, for the dense layer.
initializer: The initializer for the dense layer. Defaults to a Glorot
uniform initializer.
output: The output style for this layer. Can be either 'logits' or
'predictions'.
"""
def __init__(self,
embedding_table,
activation=None,
initializer='glorot_uniform',
output='logits',
name=None,
**kwargs):
super(MaskedLM, self).__init__(name=name, **kwargs)
self.embedding_table = embedding_table
self.activation = activation
self.initializer = tf.keras.initializers.get(initializer)
if output not in ('predictions', 'logits'):
raise ValueError(
('Unknown `output` value "%s". `output` can be either "logits" or '
'"predictions"') % output)
self._output_type = output
def build(self, input_shape):
self._vocab_size, hidden_size = self.embedding_table.shape
self.dense = tf.keras.layers.Dense(
hidden_size,
activation=self.activation,
kernel_initializer=self.initializer,
name='transform/dense')
self.layer_norm = tf.keras.layers.LayerNormalization(
axis=-1, epsilon=1e-12, name='transform/LayerNorm')
self.bias = self.add_weight(
'output_bias/bias',
shape=(self._vocab_size,),
initializer='zeros',
trainable=True)
super(MaskedLM, self).build(input_shape)
def call(self, sequence_data, masked_positions):
masked_lm_input = self._gather_indexes(sequence_data, masked_positions)
lm_data = self.dense(masked_lm_input)
lm_data = self.layer_norm(lm_data)
lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True)
logits = tf.nn.bias_add(lm_data, self.bias)
masked_positions_length = masked_positions.shape[1] or tf.shape(
masked_positions)[1]
logits = tf.reshape(logits,
[-1, masked_positions_length, self._vocab_size])
if self._output_type == 'logits':
return logits
return tf.nn.log_softmax(logits)
def get_config(self):
raise NotImplementedError('MaskedLM cannot be directly serialized because '
'it has variable sharing logic.')
def _gather_indexes(self, sequence_tensor, positions):
"""Gathers the vectors at the specific positions.
Args:
sequence_tensor: Sequence output of `BertModel` layer of shape
(`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
hidden units of `BertModel` layer.
positions: Position ids of tokens in the sequence to mask for pretraining,
with dimension (batch_size, num_predictions), where `num_predictions` is
the maximum number of tokens to mask out and predict per sequence.
Returns:
Masked out sequence tensor of shape (batch_size * num_predictions,
num_hidden).
"""
sequence_shape = tf.shape(sequence_tensor)
batch_size, seq_length = sequence_shape[0], sequence_shape[1]
width = sequence_tensor.shape.as_list()[2] or sequence_shape[2]
flat_offsets = tf.reshape(
tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])
flat_sequence_tensor = tf.reshape(sequence_tensor,
[batch_size * seq_length, width])
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
return output_tensor
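An illustrative sketch (not part of the commit) wiring `MaskedLM` to an encoder's embedding table and calling it with sequence output and masked positions; the sizes are arbitrary:

```python
import tensorflow as tf
from official.nlp.keras_nlp.encoders import bert_encoder
from official.nlp.keras_nlp.layers import masked_lm

encoder = bert_encoder.BertEncoder(
    vocab_size=100, hidden_size=32, num_attention_heads=2, num_layers=2)
lm_layer = masked_lm.MaskedLM(
    embedding_table=encoder.get_embedding_table(), output='logits')

batch_size, seq_length = 2, 16
word_ids = tf.random.uniform((batch_size, seq_length), maxval=100, dtype=tf.int32)
mask = tf.ones((batch_size, seq_length), dtype=tf.int32)
type_ids = tf.zeros((batch_size, seq_length), dtype=tf.int32)

sequence_output = encoder([word_ids, mask, type_ids])['sequence_output']
masked_positions = tf.constant([[1, 3, 5, 7], [0, 2, 4, 6]], dtype=tf.int32)
logits = lm_layer(sequence_output, masked_positions)  # [batch, num_predictions, vocab_size]
```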
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras-based one-hot embedding layer."""
# pylint: disable=g-classes-have-attributes
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package="keras_nlp")
class OnDeviceEmbedding(tf.keras.layers.Layer):
"""Performs an embedding lookup suitable for accelerator devices.
This layer uses either tf.gather or tf.one_hot to translate integer indices to
float embeddings.
Arguments:
vocab_size: Number of elements in the vocabulary.
embedding_width: Output size of the embedding layer.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding
lookup. Defaults to False (that is, using tf.gather). Setting this option
to True may improve performance, especially on small vocabulary sizes, but
will generally require more memory.
scale_factor: Whether to scale the output embeddings. Defaults to None (that
is, no scaling). Setting this option to a float multiplies the output
embeddings by scale_factor.
"""
def __init__(self,
vocab_size,
embedding_width,
initializer="glorot_uniform",
use_one_hot=False,
scale_factor=None,
**kwargs):
super(OnDeviceEmbedding, self).__init__(**kwargs)
self._vocab_size = vocab_size
self._embedding_width = embedding_width
self._initializer = initializer
self._use_one_hot = use_one_hot
self._scale_factor = scale_factor
def get_config(self):
config = {
"vocab_size": self._vocab_size,
"embedding_width": self._embedding_width,
"initializer": self._initializer,
"use_one_hot": self._use_one_hot,
"scale_factor": self._scale_factor,
}
base_config = super(OnDeviceEmbedding, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def build(self, input_shape):
self.embeddings = self.add_weight(
"embeddings",
shape=[self._vocab_size, self._embedding_width],
initializer=self._initializer,
dtype=tf.float32)
super(OnDeviceEmbedding, self).build(input_shape)
def call(self, inputs):
flat_inputs = tf.reshape(inputs, [-1])
if self._use_one_hot:
one_hot_data = tf.one_hot(
flat_inputs, depth=self._vocab_size, dtype=self.embeddings.dtype)
embeddings = tf.matmul(one_hot_data, self.embeddings)
else:
embeddings = tf.gather(self.embeddings, flat_inputs)
embeddings = tf.reshape(
embeddings,
# Work around b/142213824: prefer concat to shape over a Python list.
tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0))
embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width])
if self._scale_factor:
embeddings *= self._scale_factor
return embeddings
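An illustrative use of the new `scale_factor` option (values arbitrary); this mirrors the sqrt(d)-style scaling exercised in the test change that follows:

```python
import tensorflow as tf
from official.nlp.keras_nlp.layers import on_device_embedding

embedding_width = 27
layer = on_device_embedding.OnDeviceEmbedding(
    vocab_size=31,
    embedding_width=embedding_width,
    scale_factor=embedding_width**0.5)  # Transformer-style sqrt(d) scaling

token_ids = tf.constant([[1, 2, 3], [4, 5, 6]])
embeddings = layer(token_ids)  # shape [2, 3, 27]; values multiplied by sqrt(27)
```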
@@ -18,7 +18,7 @@ import numpy as np
 import tensorflow as tf
 from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
-from official.nlp.modeling.layers import on_device_embedding
+from official.nlp.keras_nlp.layers import on_device_embedding
 # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It
@@ -192,7 +192,8 @@ class OnDeviceEmbeddingTest(keras_parameterized.TestCase):
 vocab_size = 31
 embedding_width = 27
 test_layer = on_device_embedding.OnDeviceEmbedding(
-vocab_size=vocab_size, embedding_width=embedding_width, use_scale=True)
+vocab_size=vocab_size, embedding_width=embedding_width,
+scale_factor=embedding_width**0.5)
 # Create a 2-dimensional input (the first dimension is implicit).
 sequence_length = 23
 input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
...
@@ -17,7 +17,7 @@
 import tensorflow as tf
-@tf.keras.utils.register_keras_serializable(package="Text")
+@tf.keras.utils.register_keras_serializable(package="keras_nlp")
 class PositionEmbedding(tf.keras.layers.Layer):
 """Creates a positional embedding.
...
@@ -14,11 +14,10 @@
 # ==============================================================================
 """Keras-based TransformerEncoder block layer."""
-# Import libraries
 import tensorflow as tf
-@tf.keras.utils.register_keras_serializable(package="Text")
+@tf.keras.utils.register_keras_serializable(package="keras_nlp")
 class TransformerEncoderBlock(tf.keras.layers.Layer):
 """TransformerEncoderBlock layer.
@@ -241,6 +240,9 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
 input_tensor, attention_mask = (inputs, None)
 if self._output_range:
+  if self._norm_first:
+    source_tensor = input_tensor[:, 0:self._output_range, :]
+    input_tensor = self._attention_layer_norm(input_tensor)
 target_tensor = input_tensor[:, 0:self._output_range, :]
 attention_mask = attention_mask[:, 0:self._output_range, :]
 else:
...
@@ -14,11 +14,6 @@
 # ==============================================================================
 """Tests for Keras-based transformer block layer."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-# Import libraries
 from absl.testing import parameterized
 import numpy as np
 import tensorflow as tf
@@ -142,6 +137,34 @@ class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase):
 self.assertAllClose(
 new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
+def test_layer_output_range_with_pre_norm(self, transformer_cls):
+  test_layer = transformer_cls(
+      num_attention_heads=10, inner_dim=2048,
+      inner_activation='relu', norm_first=True)
+  sequence_length = 21
+  width = 80
+  batch_size = 6
+  input_data = 10 * np.random.random_sample(
+      (batch_size, sequence_length, width))
+  mask_data = np.random.randint(
+      2, size=(batch_size, sequence_length, sequence_length))
+  output_tensor = test_layer([input_data, mask_data])
+  # The layer only attends to the first token and outputs the first token
+  # embedding.
+  new_layer = transformer_cls(
+      num_attention_heads=10,
+      inner_dim=2048,
+      inner_activation='relu',
+      output_range=1,
+      norm_first=True)
+  _ = new_layer([input_data, mask_data])
+  new_layer.set_weights(test_layer.get_weights())
+  new_output_tensor = new_layer([input_data, mask_data])
+  self.assertAllClose(
+      new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003)
 def test_layer_invocation_with_float16_dtype(self, transformer_cls):
 tf.keras.mixed_precision.experimental.set_policy('mixed_float16')
 test_layer = transformer_cls(
...