"git@developer.sourcefind.cn:OpenDAS/torch-spline-conv.git" did not exist on "05b6540225a7dc1df52b1574db4456eeddb4e588"
Commit 83f0a576 authored by Chen Chen, committed by saberkun

Internal change

PiperOrigin-RevId: 282096004
parent 986ffac4
@@ -94,7 +94,8 @@ def run_customized_training_loop(
     metric_fn=None,
     init_checkpoint=None,
     custom_callbacks=None,
-    run_eagerly=False):
+    run_eagerly=False,
+    sub_model_export_name=None):
   """Run BERT pretrain model training using low-level API.
 
   Arguments:
@@ -131,6 +132,11 @@ def run_customized_training_loop(
       methods are invoked during training.
     run_eagerly: Whether to run model training in pure eager execution. This
       should be disabled for TPUStrategy.
+    sub_model_export_name: If not None, will export `sub_model` returned by
+      `model_fn` into checkpoint files. The name of the intermediate checkpoint
+      file is {sub_model_export_name}_step_{step}.ckpt and the last
+      checkpoint's name is {sub_model_export_name}.ckpt;
+      if None, `sub_model` will not be exported as a checkpoint.
 
   Returns:
     Trained model.
@@ -139,6 +145,8 @@ def run_customized_training_loop(
     ValueError: (1) When model returned by `model_fn` does not have optimizer
       attribute or when required parameters are set to none. (2) eval args are
       not specified correctly. (3) metric_fn must be a callable if specified.
+      (4) sub_model_export_name is specified, but `sub_model` returned
+      by `model_fn` is None.
   """
 
   if _sentinel is not None:
...@@ -191,6 +199,10 @@ def run_customized_training_loop( ...@@ -191,6 +199,10 @@ def run_customized_training_loop(
if not hasattr(model, 'optimizer'): if not hasattr(model, 'optimizer'):
raise ValueError('User should set optimizer attribute to model ' raise ValueError('User should set optimizer attribute to model '
'inside `model_fn`.') 'inside `model_fn`.')
if sub_model_export_name and sub_model is None:
raise ValueError('sub_model_export_name is specified as %s, but '
'sub_model is None.' % sub_model_export_name)
optimizer = model.optimizer optimizer = model.optimizer
use_float16 = isinstance( use_float16 = isinstance(
optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer) optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer)
@@ -326,6 +338,9 @@ def run_customized_training_loop(
   # Training loop starts here.
   checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
+  sub_model_checkpoint = tf.train.Checkpoint(
+      model=sub_model) if sub_model_export_name else None
+
   latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
   if latest_checkpoint_file:
     logging.info(
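Since `sub_model_checkpoint` tracks only the sub-model's variables (no optimizer slots), the files it writes can later be restored into a freshly built model that uses the same `model=` attribute name. A minimal sketch of that round trip, using a toy model and an illustrative path rather than anything from this change:

```python
import tensorflow as tf

# Toy stand-in for `sub_model`; any Keras model with built variables works.
sub_model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])

# Save only the sub-model's weights.
save_path = tf.train.Checkpoint(model=sub_model).save('/tmp/sub_model_demo/ckpt')

# Restore into a fresh instance of the same architecture; the `model=` key
# must match the one used at save time.
fresh = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])
status = tf.train.Checkpoint(model=fresh).restore(save_path)
status.assert_existing_objects_matched()
```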
@@ -382,7 +397,10 @@ def run_customized_training_loop(
       if current_step < total_training_steps:
         _save_checkpoint(checkpoint, model_dir,
                          checkpoint_name.format(step=current_step))
+        if sub_model_export_name:
+          _save_checkpoint(
+              sub_model_checkpoint, model_dir,
+              '%s_step_%d.ckpt' % (sub_model_export_name, current_step))
       if eval_input_fn:
         logging.info('Running evaluation after step: %s.', current_step)
         _run_evaluation(current_step,
@@ -393,6 +411,9 @@ def run_customized_training_loop(
   _save_checkpoint(checkpoint, model_dir,
                    checkpoint_name.format(step=current_step))
+  if sub_model_export_name:
+    _save_checkpoint(sub_model_checkpoint, model_dir,
+                     '%s.ckpt' % sub_model_export_name)
 
   if eval_input_fn:
     logging.info('Running final evaluation after training is complete.')
...
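Putting the training-loop changes together: a caller opts in by passing the new `sub_model_export_name` argument and otherwise calls the loop exactly as before. A hedged sketch; `strategy`, `model_fn`, the input/loss functions, and the step counts are placeholders assumed to be defined by the surrounding driver, with `model_fn` returning `(model, sub_model)` as the docstring above describes:

```python
trained_model = run_customized_training_loop(
    strategy=strategy,
    model_fn=model_fn,            # assumed to return (model, sub_model)
    loss_fn=loss_fn,
    model_dir=model_dir,
    train_input_fn=train_input_fn,
    steps_per_epoch=steps_per_epoch,
    steps_per_loop=steps_per_loop,
    epochs=epochs,
    # New in this change: intermediate checkpoints named
    # pretrained/bert_model_step_{N}.ckpt and a final pretrained/bert_model.ckpt
    # are written under model_dir for the sub-model.
    sub_model_export_name='pretrained/bert_model')
```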
@@ -77,37 +77,6 @@ def export_bert_model(model_export_path: typing.Text,
   model.save(model_export_path, include_optimizer=False, save_format='tf')
 
 
-def export_pretraining_checkpoint(
-    checkpoint_dir: typing.Text,
-    model: tf.keras.Model,
-    checkpoint_name: typing.Optional[
-        typing.Text] = 'pretrained/bert_model.ckpt'):
-  """Exports BERT model for as a checkpoint without optimizer.
-
-  Arguments:
-    checkpoint_dir: Path to where training model checkpoints are stored.
-    model: Keras model object to export.
-    checkpoint_name: File name or suffix path to export pretrained checkpoint.
-
-  Raises:
-    ValueError when either checkpoint_dir or model is not specified.
-  """
-  if not checkpoint_dir:
-    raise ValueError('checkpoint_dir must be specified.')
-  if not isinstance(model, tf.keras.Model):
-    raise ValueError('model must be a tf.keras.Model object.')
-
-  checkpoint = tf.train.Checkpoint(model=model)
-  latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
-  assert latest_checkpoint_file
-  logging.info('Checkpoint file %s found and restoring from '
-               'checkpoint', latest_checkpoint_file)
-  status = checkpoint.restore(latest_checkpoint_file)
-  status.assert_existing_objects_matched().expect_partial()
-  saved_path = checkpoint.save(os.path.join(checkpoint_dir, checkpoint_name))
-  logging.info('Exporting the model as a new TF checkpoint: %s', saved_path)
-
-
 class BertModelCheckpoint(tf.keras.callbacks.Callback):
   """Keras callback that saves model at the end of every epoch."""
...
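With `export_pretraining_checkpoint` gone, the training loop itself writes the core-model checkpoint, so downstream code only needs to restore it. A hedged sketch of consuming that checkpoint; `core_model`, `model_dir`, and the 'pretrained' subpath are illustrative and depend on the `sub_model_export_name` used at training time:

```python
import os
import tensorflow as tf

# Assumed: `core_model` is a freshly constructed BERT encoder with the same
# architecture as the trained sub-model (e.g. the second value returned by
# bert_models.pretrain_model), and `model_dir` is the training output directory.
checkpoint_path = tf.train.latest_checkpoint(
    os.path.join(model_dir, 'pretrained'))
status = tf.train.Checkpoint(model=core_model).restore(checkpoint_path)
# Only model variables were saved for the sub-model, so a partial match
# (no optimizer slots) is expected.
status.assert_existing_objects_matched().expect_partial()
```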
@@ -126,16 +126,9 @@ def run_customized_training(strategy,
       train_input_fn=train_input_fn,
       steps_per_epoch=steps_per_epoch,
       steps_per_loop=steps_per_loop,
-      epochs=epochs)
+      epochs=epochs,
+      sub_model_export_name='pretrained/bert_model')
 
-  # Creates the BERT core model outside distribution strategy scope.
-  _, core_model = bert_models.pretrain_model(bert_config, max_seq_length,
-                                             max_predictions_per_seq)
-  # Restores the core model from model checkpoints and get a new checkpoint only
-  # contains the core model.
-  model_saving_utils.export_pretraining_checkpoint(
-      checkpoint_dir=model_dir, model=core_model)
   return trained_model
...
@@ -18,139 +18,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-
 import tensorflow as tf
 import tensorflow_hub as hub
 
 from official.modeling import tf_utils
-from official.nlp import bert_modeling as modeling
+from official.nlp.modeling import losses
 from official.nlp.modeling import networks
 from official.nlp.modeling.networks import bert_classifier
+from official.nlp.modeling.networks import bert_pretrainer
 from official.nlp.modeling.networks import bert_span_labeler
-def gather_indexes(sequence_tensor, positions):
-  """Gathers the vectors at the specific positions.
-
-  Args:
-    sequence_tensor: Sequence output of `BertModel` layer of shape
-      (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of
-      hidden units of `BertModel` layer.
-    positions: Positions ids of tokens in sequence to mask for pretraining of
-      with dimension (batch_size, max_predictions_per_seq) where
-      `max_predictions_per_seq` is maximum number of tokens to mask out and
-      predict per each sequence.
-
-  Returns:
-    Masked out sequence tensor of shape (batch_size * max_predictions_per_seq,
-    num_hidden).
-  """
-  sequence_shape = tf_utils.get_shape_list(
-      sequence_tensor, name='sequence_output_tensor')
-  batch_size = sequence_shape[0]
-  seq_length = sequence_shape[1]
-  width = sequence_shape[2]
-
-  flat_offsets = tf.keras.backend.reshape(
-      tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
-  flat_positions = tf.keras.backend.reshape(positions + flat_offsets, [-1])
-  flat_sequence_tensor = tf.keras.backend.reshape(
-      sequence_tensor, [batch_size * seq_length, width])
-  output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
-  return output_tensor
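For reference, the removed `gather_indexes` adds a per-example offset so a single flat `tf.gather` can pull out the masked positions; the same selection can be written with a batched gather. A small, self-contained sanity check with toy shapes (not taken from the model code):

```python
import tensorflow as tf

batch_size, seq_length, hidden = 2, 4, 3
sequence_tensor = tf.reshape(
    tf.range(batch_size * seq_length * hidden, dtype=tf.float32),
    [batch_size, seq_length, hidden])
positions = tf.constant([[0, 2], [1, 3]])  # (batch_size, max_predictions)

# Flatten-and-gather, as in the removed helper.
flat_offsets = tf.reshape(tf.range(batch_size) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])
flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, hidden])
gathered = tf.gather(flat_sequence, flat_positions)

# Equivalent batched gather (kept 3-D, then flattened for comparison).
batched = tf.gather(sequence_tensor, positions, batch_dims=1)
assert tf.reduce_all(gathered == tf.reshape(batched, [-1, hidden]))
```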
-class BertPretrainLayer(tf.keras.layers.Layer):
-  """Wrapper layer for pre-training a BERT model.
-
-  This layer wraps an existing `bert_layer` which is a Keras Layer.
-  It outputs `sequence_output` from TransformerBlock sub-layer and
-  `sentence_output` which are suitable for feeding into a BertPretrainLoss
-  layer. This layer can be used along with an unsupervised input to
-  pre-train the embeddings for `bert_layer`.
-  """
-
-  def __init__(self,
-               config,
-               bert_layer,
-               initializer=None,
-               float_type=tf.float32,
-               **kwargs):
-    super(BertPretrainLayer, self).__init__(**kwargs)
-    self.config = copy.deepcopy(config)
-    self.float_type = float_type
-
-    self.embedding_table = bert_layer.embedding_lookup.embeddings
-    self.num_next_sentence_label = 2
-    if initializer:
-      self.initializer = initializer
-    else:
-      self.initializer = tf.keras.initializers.TruncatedNormal(
-          stddev=self.config.initializer_range)
-
-  def build(self, unused_input_shapes):
-    """Implements build() for the layer."""
-    self.output_bias = self.add_weight(
-        shape=[self.config.vocab_size],
-        name='predictions/output_bias',
-        initializer=tf.keras.initializers.Zeros())
-    self.lm_dense = tf.keras.layers.Dense(
-        self.config.hidden_size,
-        activation=tf_utils.get_activation(self.config.hidden_act),
-        kernel_initializer=self.initializer,
-        name='predictions/transform/dense')
-    self.lm_layer_norm = tf.keras.layers.LayerNormalization(
-        axis=-1, epsilon=1e-12, name='predictions/transform/LayerNorm')
-
-    # Next sentence binary classification dense layer including bias to match
-    # TF1.x BERT variable shapes.
-    with tf.name_scope('seq_relationship'):
-      self.next_seq_weights = self.add_weight(
-          shape=[self.num_next_sentence_label, self.config.hidden_size],
-          name='output_weights',
-          initializer=self.initializer)
-      self.next_seq_bias = self.add_weight(
-          shape=[self.num_next_sentence_label],
-          name='output_bias',
-          initializer=tf.keras.initializers.Zeros())
-    super(BertPretrainLayer, self).build(unused_input_shapes)
-
-  def __call__(self,
-               pooled_output,
-               sequence_output=None,
-               masked_lm_positions=None,
-               **kwargs):
-    inputs = tf_utils.pack_inputs(
-        [pooled_output, sequence_output, masked_lm_positions])
-    return super(BertPretrainLayer, self).__call__(inputs, **kwargs)
-
-  def call(self, inputs):
-    """Implements call() for the layer."""
-    unpacked_inputs = tf_utils.unpack_inputs(inputs)
-    pooled_output = unpacked_inputs[0]
-    sequence_output = unpacked_inputs[1]
-    masked_lm_positions = unpacked_inputs[2]
-
-    mask_lm_input_tensor = gather_indexes(sequence_output, masked_lm_positions)
-    lm_output = self.lm_dense(mask_lm_input_tensor)
-    lm_output = self.lm_layer_norm(lm_output)
-    lm_output = tf.matmul(lm_output, self.embedding_table, transpose_b=True)
-    lm_output = tf.nn.bias_add(lm_output, self.output_bias)
-    lm_output = tf.nn.log_softmax(lm_output, axis=-1)
-
-    logits = tf.matmul(pooled_output, self.next_seq_weights, transpose_b=True)
-    logits = tf.nn.bias_add(logits, self.next_seq_bias)
-    sentence_output = tf.nn.log_softmax(logits, axis=-1)
-    return (lm_output, sentence_output)
 class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
   """Returns layer that computes custom loss and metrics for pretraining."""
 
-  def __init__(self, bert_config, **kwargs):
+  def __init__(self, vocab_size, **kwargs):
     super(BertPretrainLossAndMetricLayer, self).__init__(**kwargs)
-    self.config = copy.deepcopy(bert_config)
+    self._vocab_size = vocab_size
+    self.config = {
+        'vocab_size': vocab_size,
+    }
   def __call__(self,
                lm_output,
@@ -167,8 +54,8 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
                           self).__call__(inputs, **kwargs)
 
   def _add_metrics(self, lm_output, lm_labels, lm_label_weights,
-                   lm_per_example_loss, sentence_output, sentence_labels,
-                   sentence_per_example_loss):
+                   lm_example_loss, sentence_output, sentence_labels,
+                   next_sentence_loss):
     """Adds metrics."""
     masked_lm_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
         lm_labels, lm_output)
@@ -178,8 +65,6 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
     self.add_metric(
         masked_lm_accuracy, name='masked_lm_accuracy', aggregation='mean')
 
-    lm_example_loss = tf.reshape(lm_per_example_loss, [-1])
-    lm_example_loss = tf.reduce_mean(lm_example_loss * lm_label_weights)
     self.add_metric(lm_example_loss, name='lm_example_loss', aggregation='mean')
 
     next_sentence_accuracy = tf.keras.metrics.sparse_categorical_accuracy(
@@ -189,9 +74,8 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
         name='next_sentence_accuracy',
         aggregation='mean')
 
-    next_sentence_mean_loss = tf.reduce_mean(sentence_per_example_loss)
     self.add_metric(
-        next_sentence_mean_loss, name='next_sentence_loss', aggregation='mean')
+        next_sentence_loss, name='next_sentence_loss', aggregation='mean')
   def call(self, inputs):
     """Implements call() for the layer."""
@@ -199,31 +83,21 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
     unpacked_inputs = tf_utils.unpack_inputs(inputs)
     lm_output = unpacked_inputs[0]
     sentence_output = unpacked_inputs[1]
     lm_label_ids = unpacked_inputs[2]
-    lm_label_ids = tf.keras.backend.reshape(lm_label_ids, [-1])
-    lm_label_ids_one_hot = tf.keras.backend.one_hot(lm_label_ids,
-                                                    self.config.vocab_size)
     lm_label_weights = tf.keras.backend.cast(unpacked_inputs[3], tf.float32)
-    lm_label_weights = tf.keras.backend.reshape(lm_label_weights, [-1])
-    lm_per_example_loss = -tf.keras.backend.sum(
-        lm_output * lm_label_ids_one_hot, axis=[-1])
-    numerator = tf.keras.backend.sum(lm_label_weights * lm_per_example_loss)
-    denominator = tf.keras.backend.sum(lm_label_weights) + 1e-5
-    mask_label_loss = numerator / denominator
-
     sentence_labels = unpacked_inputs[4]
-    sentence_labels = tf.keras.backend.reshape(sentence_labels, [-1])
-    sentence_label_one_hot = tf.keras.backend.one_hot(sentence_labels, 2)
-    per_example_loss_sentence = -tf.keras.backend.sum(
-        sentence_label_one_hot * sentence_output, axis=-1)
-    sentence_loss = tf.keras.backend.mean(per_example_loss_sentence)
+
+    mask_label_loss = losses.weighted_sparse_categorical_crossentropy_loss(
+        labels=lm_label_ids, predictions=lm_output, weights=lm_label_weights)
+    sentence_loss = losses.weighted_sparse_categorical_crossentropy_loss(
+        labels=sentence_labels, predictions=sentence_output)
     loss = mask_label_loss + sentence_loss
+
+    batch_shape = tf.slice(tf.keras.backend.shape(sentence_labels), [0], [1])
     # TODO(hongkuny): Avoids the hack and switches add_loss.
-    final_loss = tf.fill(
-        tf.keras.backend.shape(per_example_loss_sentence), loss)
+    final_loss = tf.fill(batch_shape, loss)
+
     self._add_metrics(lm_output, lm_label_ids, lm_label_weights,
-                      lm_per_example_loss, sentence_output, sentence_labels,
-                      per_example_loss_sentence)
+                      mask_label_loss, sentence_output, sentence_labels,
+                      sentence_loss)
     return final_loss
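The removed one-hot arithmetic and the new `losses.weighted_sparse_categorical_crossentropy_loss` calls both compute a (weighted) negative log-likelihood from log-probability outputs. A hedged plain-TF sketch of that formula, mirroring the deleted code rather than the library's actual implementation:

```python
import tensorflow as tf

def weighted_sparse_xent(labels, log_probs, weights=None):
  """Mean negative log-likelihood; `log_probs` are already log-softmaxed."""
  labels = tf.reshape(labels, [-1])
  log_probs = tf.reshape(log_probs, [-1, tf.shape(log_probs)[-1]])
  one_hot = tf.one_hot(labels, depth=tf.shape(log_probs)[-1],
                       dtype=log_probs.dtype)
  per_example_loss = -tf.reduce_sum(one_hot * log_probs, axis=-1)
  if weights is None:
    return tf.reduce_mean(per_example_loss)
  weights = tf.reshape(tf.cast(weights, log_probs.dtype), [-1])
  return (tf.reduce_sum(weights * per_example_loss) /
          (tf.reduce_sum(weights) + 1e-5))

# In the diff above, mask_label_loss corresponds to
#   weighted_sparse_xent(lm_label_ids, lm_output, lm_label_weights)
# and sentence_loss to
#   weighted_sparse_xent(sentence_labels, sentence_output).
```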
@@ -268,13 +142,12 @@ def pretrain_model(bert_config,
     seq_length: Maximum sequence length of the training data.
     max_predictions_per_seq: Maximum number of tokens in sequence to mask out
       and use for pretraining.
-    initializer: Initializer for weights in BertPretrainLayer.
+    initializer: Initializer for weights in BertPretrainer.
 
   Returns:
     Pretraining model as well as core BERT submodel from which to save
     weights after pretraining.
   """
   input_word_ids = tf.keras.layers.Input(
       shape=(seq_length,), name='input_word_ids', dtype=tf.int32)
   input_mask = tf.keras.layers.Input(
@@ -285,38 +158,34 @@ def pretrain_model(bert_config,
       shape=(max_predictions_per_seq,),
       name='masked_lm_positions',
       dtype=tf.int32)
-  masked_lm_ids = tf.keras.layers.Input(
-      shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)
   masked_lm_weights = tf.keras.layers.Input(
       shape=(max_predictions_per_seq,),
       name='masked_lm_weights',
       dtype=tf.int32)
   next_sentence_labels = tf.keras.layers.Input(
       shape=(1,), name='next_sentence_labels', dtype=tf.int32)
+  masked_lm_ids = tf.keras.layers.Input(
+      shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)
-  bert_submodel_name = 'bert_model'
-  bert_submodel = modeling.get_bert_model(
-      input_word_ids,
-      input_mask,
-      input_type_ids,
-      name=bert_submodel_name,
-      config=bert_config)
-  pooled_output = bert_submodel.outputs[0]
-  sequence_output = bert_submodel.outputs[1]
-  pretrain_layer = BertPretrainLayer(
-      bert_config,
-      bert_submodel.get_layer(bert_submodel_name),
-      initializer=initializer,
-      name='cls')
-  lm_output, sentence_output = pretrain_layer(pooled_output, sequence_output,
-                                              masked_lm_positions)
+  transformer_encoder = _get_transformer_encoder(bert_config, seq_length)
+  if initializer is None:
+    initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=bert_config.initializer_range)
+  pretrainer_model = bert_pretrainer.BertPretrainer(
+      network=transformer_encoder,
+      num_classes=2,  # The next sentence prediction label has two classes.
+      num_token_predictions=max_predictions_per_seq,
+      initializer=initializer,
+      output='predictions')
+
+  lm_output, sentence_output = pretrainer_model(
+      [input_word_ids, input_mask, input_type_ids, masked_lm_positions])
 
-  pretrain_loss_layer = BertPretrainLossAndMetricLayer(bert_config)
+  pretrain_loss_layer = BertPretrainLossAndMetricLayer(
+      vocab_size=bert_config.vocab_size)
   output_loss = pretrain_loss_layer(lm_output, sentence_output, masked_lm_ids,
                                     masked_lm_weights, next_sentence_labels)
-  return tf.keras.Model(
+  keras_model = tf.keras.Model(
       inputs={
           'input_word_ids': input_word_ids,
           'input_mask': input_mask,
@@ -326,7 +195,8 @@ def pretrain_model(bert_config,
           'masked_lm_weights': masked_lm_weights,
           'next_sentence_labels': next_sentence_labels,
       },
-      outputs=output_loss), bert_submodel
+      outputs=output_loss)
+  return keras_model, transformer_encoder
 
 
 class BertSquadLogitsLayer(tf.keras.layers.Layer):
...
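After this change `pretrain_model` returns both the Keras pretraining model and the underlying transformer encoder; the encoder is exactly the `sub_model` that the training loop can now export directly. A hedged usage sketch with placeholder config values:

```python
# Illustrative only: bert_config would come from a BertConfig in practice.
pretraining_model, core_encoder = pretrain_model(
    bert_config,
    seq_length=128,
    max_predictions_per_seq=20)

# `pretraining_model` is what the custom training loop optimizes;
# `core_encoder` is the sub-model whose weights are exported when
# sub_model_export_name is set (e.g. 'pretrained/bert_model').
```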