Unverified Commit 51e60bab authored by Ayushman Kumar, committed by GitHub

Merge pull request #3 from tensorflow/master

Updated
parents 7653185e 7d86c317
@@ -6,8 +6,6 @@ The [official models](official) are a collection of example models that use Tens
The [research models](https://github.com/tensorflow/models/tree/master/research) are a large collection of models implemented in TensorFlow by researchers. They are not officially supported or available in release branches; it is up to the individual researchers to maintain the models and/or provide support on issues and pull requests.
The [samples folder](samples) contains code snippets and smaller models that demonstrate features of TensorFlow, including code presented in various blog posts.
The [tutorials folder](tutorials) is a collection of models described in the [TensorFlow tutorials](https://www.tensorflow.org/tutorials/).
## Contribution guidelines
......
@@ -163,21 +163,6 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path(self):
"""No dist strat forced v1 execution path."""
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test keras based model with eager and distribution strategies."""
self._setup()
@@ -261,17 +246,6 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_force_v1_path(self):
"""Test 1 gpu using forced v1 execution path."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_force_v1_path')
FLAGS.batch_size = 128
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_graph_1_gpu(self):
"""Test 1 gpu graph."""
self._setup()
@@ -316,33 +290,6 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path(self):
"""No dist strat but forced v1 execution path."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path_run_eagerly(self):
"""Forced v1 execution path and forced eager."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_force_v1_path_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test 2 gpu."""
self._setup()
@@ -409,19 +356,6 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat_force_v1_path(self):
"""Test cpu without dist strat and force v1 in model.compile."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir(
'benchmark_cpu_no_dist_strat_force_v1_path')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_graph_cpu_no_dist_strat(self):
"""Test cpu graph mode without distribution strategies."""
self._setup()
......
@@ -205,24 +205,12 @@ def run(flags_obj):
with strategy_scope:
optimizer = common.get_optimizer(lr_schedule)
model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
# TODO(b/138957587): Remove when force_v2_in_keras_compile is no longer
# a valid arg for this model. Also remove as a valid flag.
if flags_obj.force_v2_in_keras_compile is not None:
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=optimizer,
      metrics=(['sparse_categorical_accuracy']
               if flags_obj.report_accuracy_metrics else None),
      run_eagerly=flags_obj.run_eagerly,
      experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)
else:
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=optimizer,
      metrics=(['sparse_categorical_accuracy']
               if flags_obj.report_accuracy_metrics else None),
      run_eagerly=flags_obj.run_eagerly)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=optimizer,
    metrics=(['sparse_categorical_accuracy']
             if flags_obj.report_accuracy_metrics else None),
    run_eagerly=flags_obj.run_eagerly)
train_epochs = flags_obj.train_epochs
......
@@ -142,25 +142,12 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_force_v1_path_early_stop(self):
self._setup()
FLAGS.early_stopping = True
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_force_v1_path_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.early_stopping = True
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly_early_stop(self):
self._setup()
FLAGS.distribution_strategy = 'off'
@@ -174,13 +161,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.enable_xla = True
self._run_and_report_benchmark()
def benchmark_xla_1_gpu_force_v1_path_early_stop(self):
self._setup()
FLAGS.early_stopping = True
FLAGS.enable_xla = True
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
@@ -233,14 +213,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_no_dist_strat_force_v1_path_mlperf_like(self):
"""1 GPU using compile/fit without dist_strat."""
self._setup()
FLAGS.train_epochs = 7
FLAGS.distribution_strategy = 'off'
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_mlperf_like(self):
"""1 GPU using compile/fit without dist_strat."""
self._setup()
@@ -353,20 +325,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.epsilon = 1e-8
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_force_v1_path_mlperf_like(self):
"""8 GPU using keras fit/compile v1 codepath."""
self._setup()
FLAGS.num_gpus = 8
FLAGS.train_epochs = 17
FLAGS.batch_size = 1048576
FLAGS.eval_batch_size = 160000
FLAGS.learning_rate = 0.0045
FLAGS.beta1 = 0.25
FLAGS.beta2 = 0.5
FLAGS.epsilon = 1e-8
FLAGS.force_v2_in_keras_compile = False
self._run_and_report_benchmark_mlperf_like()
def benchmark_8_gpu_ctl_mlperf_like(self):
"""8 GPU using CTL."""
self._setup()
......
@@ -150,7 +150,9 @@ def run_customized_training_loop(
and model variables pairs as input, manipulate them, and returns a new
gradients and model variables pairs. The callback functions will be
invoked in the list order and before gradients are allreduced.
With mixed precision training, the pre_allreduce_callbacks will be
applied on scaled_gradients. Default is no callbacks.
Only used when explicit_allreduce=True.
post_allreduce_callbacks: A list of callback functions that takes
gradients and model variables pairs as input, manipulate them, and
returns a new gradients and model variables pairs. The callback
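For reference, each callback in these lists is just a function that maps a list of (gradient, variable) pairs to a new list of the same shape. A minimal, self-contained sketch modeled on the `clip_by_global_norm_callback` used later in this change (the toy tensors are illustrative only):

```python
import tensorflow as tf

def clip_by_global_norm_callback(grads_and_vars):
  """Illustrative callback: clip gradients by global norm, keep variables paired."""
  grads, variables = zip(*grads_and_vars)
  (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
  return list(zip(clipped_grads, variables))

# Toy usage: two (gradient, variable) pairs.
v1, v2 = tf.Variable([1.0, 2.0]), tf.Variable([3.0])
g1, g2 = tf.constant([10.0, 10.0]), tf.constant([10.0])
clipped = clip_by_global_norm_callback([(g1, v1), (g2, v2)])
```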
@@ -393,8 +395,8 @@ def run_customized_training_loop(
train_steps(train_iterator,
            tf.convert_to_tensor(steps, dtype=tf.int32))
train_loss = _float_metric_value(train_loss_metric)
_run_callbacks_on_batch_end(current_step, {'loss': train_loss})
current_step += steps
_run_callbacks_on_batch_end(current_step - 1, {'loss': train_loss})
# Updates training logging.
training_status = 'Train Step: %d/%d / loss = %s' % (
......
# TensorFlow Natural Language Processing Modelling Toolkit
tensorflow/models/official/nlp provides a [modeling library](modeling) for constructing
NLP model architectures, as well as TF2 reference implementations for
state-of-the-art models.
The repository contains the following models, with implementations, pre-trained
model weights, usage scripts and conversion utilities:
* [Bert](bert)
* [Albert](albert)
* [XLNet](xlnet)
* [Transformer for translation](transformer)
Additional features:
* Distributed training on both multi-GPU and TPU
* End-to-end training for custom models, including both pretraining and finetuning.
@@ -18,6 +18,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gin
import tensorflow as tf
import tensorflow_hub as hub
@@ -85,16 +86,46 @@ class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer):
return final_loss
@gin.configurable
def get_transformer_encoder(bert_config,
                            sequence_length,
                            transformer_encoder_cls=None):
"""Gets a 'TransformerEncoder' object.
Args:
bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
sequence_length: Maximum sequence length of the training data.
transformer_encoder_cls: An EncoderScaffold class. If it is None, uses the
default BERT encoder implementation.
Returns:
A networks.TransformerEncoder object.
"""
if transformer_encoder_cls is not None:
# TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
embedding_cfg = dict(
vocab_size=bert_config.vocab_size,
type_vocab_size=bert_config.type_vocab_size,
hidden_size=bert_config.hidden_size,
seq_length=sequence_length,
max_seq_length=bert_config.max_position_embeddings,
initializer=tf.keras.initializers.TruncatedNormal(
stddev=bert_config.initializer_range),
dropout_rate=bert_config.hidden_dropout_prob,
)
hidden_cfg = dict(
num_attention_heads=bert_config.num_attention_heads,
intermediate_size=bert_config.intermediate_size,
intermediate_activation=tf_utils.get_activation(bert_config.hidden_act),
dropout_rate=bert_config.hidden_dropout_prob,
attention_dropout_rate=bert_config.attention_probs_dropout_prob,
)
kwargs = dict(embedding_cfg=embedding_cfg, hidden_cfg=hidden_cfg,
num_hidden_instances=bert_config.num_hidden_layers,)
# Relies on gin configuration to define the Transformer encoder arguments.
return transformer_encoder_cls(**kwargs)
kwargs = dict(
vocab_size=bert_config.vocab_size,
hidden_size=bert_config.hidden_size,
......
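Because `get_transformer_encoder` is now gin-configurable, the encoder class can be swapped from a gin binding instead of a code change. A hedged sketch (the configurable names and scoping here are assumptions; they depend on how gin registers `EncoderScaffold`, which this change also decorates with `@gin.configurable`):

```python
import gin

# Illustrative binding: route get_transformer_encoder to EncoderScaffold
# instead of the default BERT encoder. The remaining EncoderScaffold
# arguments are expected to come from gin as well, as the TODO above notes.
gin.parse_config("""
get_transformer_encoder.transformer_encoder_cls = @EncoderScaffold
""")
```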
@@ -20,6 +20,14 @@ import tensorflow as tf
from official.utils.flags import core as flags_core
def define_gin_flags():
"""Define common gin configurable flags."""
flags.DEFINE_multi_string('gin_file', None,
'List of paths to the config files.')
flags.DEFINE_multi_string(
'gin_param', None, 'Newline separated list of Gin parameter bindings.')
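A hedged usage sketch of these two flags, mirroring the call added to run_pretraining.py later in this change (the command-line paths and binding string are illustrative, not canonical):

```python
# Typical invocation (illustrative):
#   python run_pretraining.py \
#     --gin_file=/path/to/experiment.gin \
#     --gin_param="get_transformer_encoder.transformer_encoder_cls = @EncoderScaffold"
import gin
from absl import flags

FLAGS = flags.FLAGS  # assumes define_gin_flags() ran and absl parsed argv

def apply_gin_flags():
  # Mirrors the call added in run_pretraining.py in this change.
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
```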
def define_common_bert_flags():
"""Define common flags for BERT tasks."""
flags_core.define_base(
@@ -69,8 +77,6 @@ def define_common_bert_flags():
flags.DEFINE_bool('hub_module_trainable', True,
'True to make keras layers in the hub module trainable.')
flags_core.define_log_steps()
# Adds flags for mixed precision and multi-worker training.
flags_core.define_performance(
num_parallel_calls=False,
......
@@ -169,7 +169,7 @@ def run_bert_classifier(strategy,
epochs,
steps_per_epoch,
eval_steps,
custom_callbacks=None)
# Use user-defined loop to start training.
logging.info('Training using customized training loop TF 2.0 with '
@@ -363,15 +363,6 @@ def run_bert(strategy,
if not strategy:
raise ValueError('Distribution strategy has not been specified.')
if FLAGS.log_steps:
custom_callbacks = [keras_utils.TimeHistory(
batch_size=FLAGS.train_batch_size,
log_steps=FLAGS.log_steps,
logdir=FLAGS.model_dir,
)]
else:
custom_callbacks = None
trained_model = run_bert_classifier(
strategy,
model_config,
@@ -387,8 +378,7 @@ def run_bert(strategy,
train_input_fn,
eval_input_fn,
run_eagerly=FLAGS.run_eagerly,
use_keras_compile_fit=FLAGS.use_keras_compile_fit)
custom_callbacks=custom_callbacks)
if FLAGS.model_export_path:
# As Keras ModelCheckpoint callback used with Keras compile/fit() API
......
@@ -20,6 +20,7 @@ from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import gin
import tensorflow as tf
from official.modeling import model_training_utils
@@ -49,6 +50,7 @@ flags.DEFINE_float('warmup_steps', 10000,
'Warmup steps for Adam weight decay optimizer.')
common_flags.define_common_bert_flags()
common_flags.define_gin_flags()
FLAGS = flags.FLAGS
@@ -158,7 +160,7 @@ def run_bert_pretrain(strategy):
def main(_):
# Users should always run this script under TF 2.x
assert tf.version.VERSION.startswith('2.')
gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
if not FLAGS.model_dir:
FLAGS.model_dir = '/tmp/bert20/'
strategy = distribution_utils.get_distribution_strategy(
......
@@ -29,7 +29,6 @@ from official.nlp.bert import run_squad_helper
from official.nlp.bert import tokenization
from official.nlp.data import squad_lib as squad_lib_wp
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
flags.DEFINE_string('vocab_file', None,
@@ -95,21 +94,7 @@ def main(_):
all_reduce_alg=FLAGS.all_reduce_alg,
tpu_address=FLAGS.tpu)
if FLAGS.mode in ('train', 'train_and_predict'):
if FLAGS.log_steps:
  custom_callbacks = [keras_utils.TimeHistory(
      batch_size=FLAGS.train_batch_size,
      log_steps=FLAGS.log_steps,
      logdir=FLAGS.model_dir,
  )]
else:
  custom_callbacks = None
train_squad(
    strategy,
    input_meta_data,
    custom_callbacks=custom_callbacks,
    run_eagerly=FLAGS.run_eagerly,
)
train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
if FLAGS.mode in ('predict', 'train_and_predict'):
predict_squad(strategy, input_meta_data)
......
@@ -269,11 +269,10 @@ def train_squad(strategy,
loss_factor=1.0 /
strategy.num_replicas_in_sync if FLAGS.scale_loss else 1.0)
# If explicit_allreduce = True, apply_gradients() no longer implicitly
# allreduces gradients; users manually allreduce gradients and pass the
# allreduced grads_and_vars to apply_gradients(). clip_by_global_norm will
# be applied to the allreduced gradients.
def clip_by_global_norm_callback(grads_and_vars):
grads, variables = zip(*grads_and_vars)
(clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
@@ -291,8 +290,8 @@ def train_squad(strategy,
init_checkpoint=FLAGS.init_checkpoint,
run_eagerly=run_eagerly,
custom_callbacks=custom_callbacks,
explicit_allreduce=False,
post_allreduce_callbacks=[clip_by_global_norm_callback])
def predict_squad(strategy, input_meta_data, tokenizer, bert_config, squad_lib):
......
# NLP Modeling Library
This library provides a set of Keras primitives (Layers, Networks, and Models)
that can be assembled into transformer-based models. They are
flexible, validated, interoperable, and both TF1 and TF2 compatible.
* [`layers`](layers) are the fundamental building blocks for NLP models.
They can be used to assemble new layers, networks, or models.
* [`networks`](networks) are combinations of layers (and possibly other networks). They are sub-units of models that would not be trained alone. They
encapsulate common network structures like a classification head
or a transformer encoder into an easily handled object with a
standardized configuration.
* [`models`](models) are combinations of layers and networks that would be trained. Pre-built canned models are provided as both convenience functions and canonical examples.
* [`losses`](losses) contains common loss computation used in NLP tasks.
Besides the pre-defined primitives, it also provides scaffold classes to allow
easy experimentation with novel architectures, e.g., you don’t need to fork a whole Transformer object to try a different kind of attention primitive.
* [`TransformerScaffold`](layers/transformer_scaffold.py) implements the
Transformer from ["Attention Is All You Need"]
(https://arxiv.org/abs/1706.03762), with a customizable attention layer
option. Users can pass a class to `attention_cls` and associated config to
`attention_cfg`, in which case the scaffold will instantiate the class with
the config, or pass a class instance to `attention_cls`.
* [`EncoderScaffold`](networks/encoder_scaffold.py) implements the transformer
encoder from ["BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding"](https://arxiv.org/abs/1810.04805), with customizable
embedding subnetwork (which will replace the standard embedding logic) and/or a
custom hidden layer (which will replace the Transformer instantiation in the
encoder).
BERT and ALBERT models in this repo are implemented using this library. Code examples can be found in the corresponding model folder.
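The scaffold injection pattern described above is simple: you hand the scaffold either a layer class plus a config dictionary, or an already-constructed layer instance. A self-contained sketch of that dispatch logic, mirroring the `hidden_cls` handling shown in `encoder_scaffold.py` later in this change (the toy layer is illustrative, not part of the library):

```python
import inspect
import tensorflow as tf

class ToyBlock(tf.keras.layers.Layer):
  """Stand-in for a custom Transformer-style block (illustrative only)."""

  def __init__(self, units=4, **kwargs):
    super(ToyBlock, self).__init__(**kwargs)
    self.dense = tf.keras.layers.Dense(units)

  def call(self, inputs):
    return self.dense(inputs)

def build_hidden_layer(hidden_cls, hidden_cfg=None):
  """Accept either a layer class (plus optional config) or a layer instance."""
  if inspect.isclass(hidden_cls):
    return hidden_cls(**hidden_cfg) if hidden_cfg else hidden_cls()
  return hidden_cls  # already an instance

layer = build_hidden_layer(ToyBlock, {'units': 8})
print(layer(tf.ones([2, 3])).shape)  # (2, 8)
```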
# Layers
Layers are the fundamental building blocks for NLP models. They can be used to
assemble new layers, networks, or models.
* [DenseEinsum](dense_einsum.py) implements a feedforward network using tf.einsum. This layer contains the einsum op, the associated weight, and the
logic required to generate the einsum expression for the given initialization
parameters.
* [Attention](attention.py) implements an optionally masked attention between two tensors, from_tensor and to_tensor, as described in ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). If `from_tensor` and `to_tensor` are the same, then this is self-attention.
* [CachedAttention](attention.py) implements an attention layer with cache used
for auto-regressive decoding.
* [Transformer](transformer.py) implements an optionally masked transformer as
described in ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
* [OnDeviceEmbedding](on_device_embedding.py) implements efficient embedding lookups designed for TPU-based models.
* [PositionalEmbedding](position_embedding.py) creates a positional embedding
as described in ["BERT: Pre-training
of Deep Bidirectional Transformers for Language Understanding"]
(https://arxiv.org/abs/1810.04805).
* [SelfAttentionMask](self_attention_mask.py) creates a 3D attention mask from a 2D tensor mask.
* [MaskedSoftmax](masked_softmax.py) implements a softmax with an optional masking input. If no mask is provided to this layer, it performs a standard softmax; however, if a mask tensor is applied (which should be 1 in positions where the data should be allowed through, and 0 where the data should be masked), the output will have masked positions set to approximately zero.
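To make the masking behavior concrete, here is a small conceptual sketch of the computation MaskedSoftmax performs, written with plain TensorFlow ops rather than the library layer (an illustration of the idea, not the layer's actual implementation or API):

```python
import tensorflow as tf

def masked_softmax(scores, mask=None):
  """Softmax over the last axis; masked positions (mask == 0) get ~zero weight."""
  if mask is not None:
    # Push masked positions toward -inf before the softmax.
    adder = (1.0 - tf.cast(mask, scores.dtype)) * -10000.0
    scores = scores + adder
  return tf.nn.softmax(scores, axis=-1)

scores = tf.random.normal([2, 4, 4])          # e.g. attention scores
mask = tf.constant([[[1, 1, 0, 0]] * 4] * 2)  # 1 = keep, 0 = mask out
weights = masked_softmax(scores, mask)
```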
@@ -19,6 +19,7 @@ from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import gin
import tensorflow as tf
from official.nlp.modeling.layers import attention
@@ -26,6 +27,7 @@ from official.nlp.modeling.layers import dense_einsum
@tf.keras.utils.register_keras_serializable(package="Text")
@gin.configurable
class TransformerScaffold(tf.keras.layers.Layer):
"""Transformer scaffold layer.
......
# Losses
Losses contains common loss computation used in NLP tasks.
* `weighted_sparse_categorical_crossentropy_loss` computes per-batch sparse
categorical crossentropy loss.
* `weighted_sparse_categorical_crossentropy_per_example_loss` computes
per-example sparse categorical crossentropy loss.
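A conceptual sketch of the per-example computation these losses perform, using stock Keras ops rather than the library's own functions (whose exact signatures may differ); the tensors and weights below are illustrative:

```python
import tensorflow as tf

labels = tf.constant([2, 0])           # sparse integer labels
logits = tf.random.normal([2, 5])      # per-example class logits
weights = tf.constant([1.0, 0.5])      # per-example weights

per_example = tf.keras.losses.sparse_categorical_crossentropy(
    labels, logits, from_logits=True)  # shape [2]
batch_loss = tf.reduce_sum(per_example * weights) / tf.reduce_sum(weights)
```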
# Models
Models are combinations of layers and networks that would be trained.
Several pre-built canned models are provided to train encoder networks. These
models are intended as both convenience functions and canonical examples.
* [`BertClassifier`](bert_classifier.py) implements a simple classification
model containing a single classification head using the Classification network.
* [`BertSpanLabeler`](bert_span_labeler.py) implements a simple single-span
start-end predictor (that is, a model that predicts two values: a start token
index and an end token index), suitable for SQuAD-style tasks.
* [`BertPretrainer`](bert_pretrainer.py) implements a masked LM and a
classification head using the Masked LM and Classification networks,
respectively.
# Networks
Networks are combinations of layers (and possibly other networks). They are sub-units of models that would not be trained alone. They
encapsulate common network structures like a classification head
or a transformer encoder into an easily handled object with a
standardized configuration.
* [`TransformerEncoder`](transformer_encoder.py) implements a bi-directional
Transformer-based encoder as described in ["BERT: Pre-training of Deep
Bidirectional Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding lookups,
transformer layers and pooling layer.
* [`AlbertTransformerEncoder`](albert_transformer_encoder.py) implements a
Transformer-encoder described in the paper ["ALBERT: A Lite BERT for
Self-supervised Learning of Language Representations"](https://arxiv.org/abs/1909.11942).
Compared with [BERT](https://arxiv.org/abs/1810.04805), ALBERT factorizes embedding parameters
into two smaller matrices and shares parameters across layers.
* [`MaskedLM`](masked_lm.py) implements a masked language model for BERT pretraining. It assumes that the network being passed has a `get_embedding_table()` method.
* [`Classification`](classification.py) contains a single hidden layer, and is intended for use as a classification head.
* [`SpanLabeling`](span_labeling.py) implements a single-span labeler (that is, a prediction head that can predict one start and end index per batch item) based on a single dense hidden layer. It can be used in the SQuAD task.
# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,6 +21,8 @@ from __future__ import division
from __future__ import print_function
import inspect
import gin
import tensorflow as tf
from tensorflow.python.keras.engine import network  # pylint: disable=g-direct-tensorflow-import
@@ -27,6 +30,7 @@ from official.nlp.modeling import layers
@tf.keras.utils.register_keras_serializable(package='Text')
@gin.configurable
class EncoderScaffold(network.Network):
"""Bi-directional Transformer-based encoder network scaffold.
@@ -96,7 +100,6 @@ class EncoderScaffold(network.Network):
hidden_cls=layers.Transformer,
hidden_cfg=None,
**kwargs):
print(embedding_cfg)
self._self_setattr_tracking = False
self._hidden_cls = hidden_cls
self._hidden_cfg = hidden_cfg
@@ -171,7 +174,8 @@ class EncoderScaffold(network.Network):
for _ in range(num_hidden_instances):
if inspect.isclass(hidden_cls):
layer = self._hidden_cls(
    **hidden_cfg) if hidden_cfg else self._hidden_cls()
else:
layer = self._hidden_cls
data = layer([data, attention_mask])
......
@@ -23,16 +23,18 @@ import random
import tarfile
# pylint: disable=g-bad-import-order
import six
from six.moves import urllib
from absl import app as absl_app
from absl import flags
from absl import logging
import six
from six.moves import range
from six.moves import urllib
from six.moves import zip
import tensorflow.compat.v1 as tf
# pylint: enable=g-bad-import-order
from official.nlp.transformer.utils import tokenizer
from official.utils.flags import core as flags_core
# pylint: enable=g-bad-import-order
# Data sources for training/evaluating the transformer translation model.
# If any of the training sources are changed, then either:
@@ -148,7 +150,7 @@ def download_report_hook(count, block_size, total_size):
total_size: total size
"""
percent = int(count * block_size * 100 / total_size)
print(six.ensure_str("\r%d%%" % percent) + " completed", end="\r")
def download_from_url(path, url):
@@ -161,12 +163,12 @@ def download_from_url(path, url):
Returns:
Full path to downloaded file
"""
filename = six.ensure_str(url).split("/")[-1]
found_file = find_file(path, filename, max_depth=0)
if found_file is None:
filename = os.path.join(path, filename)
logging.info("Downloading from %s to %s." % (url, filename))
inprogress_filepath = six.ensure_str(filename) + ".incomplete"
inprogress_filepath, _ = urllib.request.urlretrieve(
url, inprogress_filepath, reporthook=download_report_hook)
# Print newline to clear the carriage return from the download progress.
@@ -242,8 +244,10 @@ def compile_files(raw_dir, raw_files, tag):
"""
logging.info("Compiling files with tag %s." % tag)
filename = "%s-%s" % (_PREFIX, tag)
input_compiled_file = os.path.join(raw_dir,
                                   six.ensure_str(filename) + ".lang1")
target_compiled_file = os.path.join(raw_dir,
                                    six.ensure_str(filename) + ".lang2")
with tf.io.gfile.GFile(input_compiled_file, mode="w") as input_writer:
with tf.io.gfile.GFile(target_compiled_file, mode="w") as target_writer:
@@ -295,7 +299,7 @@ def encode_and_save_files(
target_file = raw_files[1]
# Write examples to each shard in round robin order.
tmp_filepaths = [six.ensure_str(fname) + ".incomplete" for fname in filepaths]
writers = [tf.python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
counter, shard = 0, 0
for counter, (input_line, target_line) in enumerate(zip(
@@ -328,7 +332,7 @@ def shuffle_records(fname):
logging.info("Shuffling records in file %s" % fname)
# Rename file prior to shuffling
tmp_fname = six.ensure_str(fname) + ".unshuffled"
tf.gfile.Rename(fname, tmp_fname)
reader = tf.io.tf_record_iterator(tmp_fname)
......