resolve merge conflicts

31ca3b97 · Kaushik Shivakumar · 3e9d886d · 7fcd7cba · 31ca3b97 · 31ca3b97
Commit 31ca3b97 authored Jul 23, 2020 by Kaushik Shivakumar
20 changed files
--- a/official/nlp/modeling/models/bert_classifier.py
+++ b/official/nlp/modeling/models/bert_classifier.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import tensorflow as tf
+from official.nlp.modeling import layers
 from official.nlp.modeling import networks
@@ -36,6 +37,9 @@ class BertClassifier(tf.keras.Model):
  instantiates a classification network based on the passed `num_classes`
  argument. If `num_classes` is set to 1, a regression network is instantiated.
+  *Note* that the model is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    network: A transformer network. This network should output a sequence output
      and a classification output. Furthermore, it should expose its embedding
@@ -43,23 +47,25 @@ class BertClassifier(tf.keras.Model):
    num_classes: Number of classes to predict from the classification network.
    initializer: The initializer (if any) to use in the classification networks.
      Defaults to a Glorot uniform initializer.
-    output: The output style for this network. Can be either 'logits' or
+    dropout_rate: The dropout probability of the cls head.
-      'predictions'.
+    use_encoder_pooler: Whether to use the pooler layer pre-defined inside
+      the encoder.
  """
  def __init__(self,
               network,
               num_classes,
               initializer='glorot_uniform',
-               output='logits',
               dropout_rate=0.1,
+               use_encoder_pooler=True,
               **kwargs):
    self._self_setattr_tracking = False
+    self._network = network
    self._config = {
        'network': network,
        'num_classes': num_classes,
        'initializer': initializer,
-        'output': output,
+        'use_encoder_pooler': use_encoder_pooler,
    }
    # We want to use the inputs of the passed network as the inputs to this
@@ -67,22 +73,36 @@ class BertClassifier(tf.keras.Model):
    # when we construct the Model object at the end of init.
    inputs = network.inputs
-    # Because we have a copy of inputs to create this Model object, we can
+    if use_encoder_pooler:
-    # invoke the Network object with its own input tensors to start the Model.
+      # Because we have a copy of inputs to create this Model object, we can
-    _, cls_output = network(inputs)
+      # invoke the Network object with its own input tensors to start the Model.
-    cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)
+      _, cls_output = network(inputs)
+      cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)
-    self.classifier = networks.Classification(
+      self.classifier = networks.Classification(
-        input_width=cls_output.shape[-1],
+          input_width=cls_output.shape[-1],
-        num_classes=num_classes,
+          num_classes=num_classes,
-        initializer=initializer,
+          initializer=initializer,
-        output=output,
+          output='logits',
-        name='classification')
+          name='sentence_prediction')
-    predictions = self.classifier(cls_output)
+      predictions = self.classifier(cls_output)
+    else:
+      sequence_output, _ = network(inputs)
+      self.classifier = layers.ClassificationHead(
+          inner_dim=sequence_output.shape[-1],
+          num_classes=num_classes,
+          initializer=initializer,
+          dropout_rate=dropout_rate,
+          name='sentence_prediction')
+      predictions = self.classifier(sequence_output)
    super(BertClassifier, self).__init__(
        inputs=inputs, outputs=predictions, **kwargs)
+  @property
+  def checkpoint_items(self):
+    return dict(encoder=self._network)
  def get_config(self):
    return self._config

--- a/official/nlp/modeling/models/bert_classifier_test.py
+++ b/official/nlp/modeling/models/bert_classifier_test.py
@@ -42,8 +42,7 @@ class BertClassifierTest(keras_parameterized.TestCase):
    # Create a BERT trainer with the created network.
    bert_trainer_model = bert_classifier.BertClassifier(
-        test_network,
+        test_network, num_classes=num_classes)
-        num_classes=num_classes)
    # Create a set of 2-dimensional inputs (the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -89,7 +88,7 @@ class BertClassifierTest(keras_parameterized.TestCase):
    # Create a BERT trainer with the created network. (Note that all the args
    # are different, so we can catch any serialization mismatches.)
    bert_trainer_model = bert_classifier.BertClassifier(
-        test_network, num_classes=4, initializer='zeros', output='predictions')
+        test_network, num_classes=4, initializer='zeros')
    # Create another BERT trainer via serialization and deserialization.
    config = bert_trainer_model.get_config()

--- a/official/nlp/modeling/models/bert_pretrainer.py
+++ b/official/nlp/modeling/models/bert_pretrainer.py
@@ -41,6 +41,9 @@ class BertPretrainer(tf.keras.Model):
  instantiates the masked language model and classification networks that are
  used to create the training objectives.
+  *Note* that the model is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    network: A transformer network. This network should output a sequence output
      and a classification output.
@@ -147,11 +150,9 @@ class BertPretrainerV2(tf.keras.Model):
  (Experimental).
  Adds the masked language model head and optional classification heads upon the
-  transformer encoder. When num_masked_tokens == 0, there won't be MaskedLM
+  transformer encoder.
-  head.
  Arguments:
-    num_masked_tokens: Number of tokens to predict from the masked LM.
    encoder_network: A transformer network. This network should output a
      sequence output and a classification output.
    mlm_activation: The activation (if any) to use in the masked LM network. If
@@ -169,7 +170,6 @@ class BertPretrainerV2(tf.keras.Model):
  def __init__(
      self,
-      num_masked_tokens: int,
      encoder_network: tf.keras.Model,
      mlm_activation=None,
      mlm_initializer='glorot_uniform',
@@ -179,7 +179,6 @@ class BertPretrainerV2(tf.keras.Model):
    self._self_setattr_tracking = False
    self._config = {
        'encoder_network': encoder_network,
-        'num_masked_tokens': num_masked_tokens,
        'mlm_initializer': mlm_initializer,
        'classification_heads': classification_heads,
        'name': name,
@@ -195,19 +194,16 @@ class BertPretrainerV2(tf.keras.Model):
      raise ValueError('Classification heads should have unique names.')
    outputs = dict()
-    if num_masked_tokens > 0:
+    self.masked_lm = layers.MaskedLM(
-      self.masked_lm = layers.MaskedLM(
+        embedding_table=self.encoder_network.get_embedding_table(),
-          embedding_table=self.encoder_network.get_embedding_table(),
+        activation=mlm_activation,
-          activation=mlm_activation,
+        initializer=mlm_initializer,
-          initializer=mlm_initializer,
+        name='cls/predictions')
-          name='cls/predictions')
+    masked_lm_positions = tf.keras.layers.Input(
-      masked_lm_positions = tf.keras.layers.Input(
+        shape=(None,), name='masked_lm_positions', dtype=tf.int32)
-          shape=(num_masked_tokens,),
+    inputs.append(masked_lm_positions)
-          name='masked_lm_positions',
+    outputs['lm_output'] = self.masked_lm(
-          dtype=tf.int32)
+        sequence_output, masked_positions=masked_lm_positions)
-      inputs.append(masked_lm_positions)
-      outputs['lm_output'] = self.masked_lm(
-          sequence_output, masked_positions=masked_lm_positions)
    for cls_head in self.classification_heads:
      outputs[cls_head.name] = cls_head(sequence_output)
@@ -217,7 +213,7 @@ class BertPretrainerV2(tf.keras.Model):
  @property
  def checkpoint_items(self):
    """Returns a dictionary of items to be additionally checkpointed."""
-    items = dict(encoder=self.encoder_network)
+    items = dict(encoder=self.encoder_network, masked_lm=self.masked_lm)
    for head in self.classification_heads:
      for key, item in head.checkpoint_items.items():
        items['.'.join([head.name, key])] = item

--- a/official/nlp/modeling/models/bert_pretrainer_test.py
+++ b/official/nlp/modeling/models/bert_pretrainer_test.py
@@ -118,10 +118,9 @@ class BertPretrainerTest(keras_parameterized.TestCase):
        vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)
    # Create a BERT trainer with the created network.
-    num_token_predictions = 2
    bert_trainer_model = bert_pretrainer.BertPretrainerV2(
-        encoder_network=test_network, num_masked_tokens=num_token_predictions)
+        encoder_network=test_network)
+    num_token_predictions = 20
    # Create a set of 2-dimensional inputs (the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -145,7 +144,7 @@ class BertPretrainerTest(keras_parameterized.TestCase):
    # Create a BERT trainer with the created network. (Note that all the args
    # are different, so we can catch any serialization mismatches.)
    bert_trainer_model = bert_pretrainer.BertPretrainerV2(
-        encoder_network=test_network, num_masked_tokens=2)
+        encoder_network=test_network)
    # Create another BERT trainer via serialization and deserialization.
    config = bert_trainer_model.get_config()

--- a/official/nlp/modeling/models/bert_span_labeler.py
+++ b/official/nlp/modeling/models/bert_span_labeler.py
@@ -32,9 +32,12 @@ class BertSpanLabeler(tf.keras.Model):
  encoder as described in "BERT: Pre-training of Deep Bidirectional Transformers
  for Language Understanding" (https://arxiv.org/abs/1810.04805).
-  The BertSpanLabeler allows a user to pass in a transformer stack, and
+  The BertSpanLabeler allows a user to pass in a transformer encoder, and
  instantiates a span labeling network based on a single dense layer.
+  *Note* that the model is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    network: A transformer network. This network should output a sequence output
      and a classification output. Furthermore, it should expose its embedding

--- a/official/nlp/modeling/models/bert_token_classifier.py
+++ b/official/nlp/modeling/models/bert_token_classifier.py
@@ -36,6 +36,9 @@ class BertTokenClassifier(tf.keras.Model):
  instantiates a token classification network based on the passed `num_classes`
  argument.
+  *Note* that the model is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    network: A transformer network. This network should output a sequence output
      and a classification output. Furthermore, it should expose its embedding

--- a/official/nlp/modeling/models/electra_pretrainer.py
+++ b/official/nlp/modeling/models/electra_pretrainer.py
@@ -39,6 +39,9 @@ class ElectraPretrainer(tf.keras.Model):
  model (at generator side) and classification networks (at discriminator side)
  that are used to create the training objectives.
+  *Note* that the model is constructed by Keras Subclass API, where layers are
+  defined inside __init__ and call() implements the computation.
  Arguments:
    generator_network: A transformer network for generator, this network should
      output a sequence output and an optional classification output.
@@ -48,7 +51,6 @@ class ElectraPretrainer(tf.keras.Model):
    num_classes: Number of classes to predict from the classification network
      for the generator network (not used now)
    sequence_length: Input sequence length
-    last_hidden_dim: Last hidden dim of generator transformer output
    num_token_predictions: Number of tokens to predict from the masked LM.
    mlm_activation: The activation (if any) to use in the masked LM and
      classification networks. If None, no activation will be used.
@@ -66,7 +68,6 @@ class ElectraPretrainer(tf.keras.Model):
               vocab_size,
               num_classes,
               sequence_length,
-               last_hidden_dim,
               num_token_predictions,
               mlm_activation=None,
               mlm_initializer='glorot_uniform',
@@ -80,7 +81,6 @@ class ElectraPretrainer(tf.keras.Model):
        'vocab_size': vocab_size,
        'num_classes': num_classes,
        'sequence_length': sequence_length,
-        'last_hidden_dim': last_hidden_dim,
        'num_token_predictions': num_token_predictions,
        'mlm_activation': mlm_activation,
        'mlm_initializer': mlm_initializer,
@@ -95,7 +95,6 @@ class ElectraPretrainer(tf.keras.Model):
    self.vocab_size = vocab_size
    self.num_classes = num_classes
    self.sequence_length = sequence_length
-    self.last_hidden_dim = last_hidden_dim
    self.num_token_predictions = num_token_predictions
    self.mlm_activation = mlm_activation
    self.mlm_initializer = mlm_initializer
@@ -108,14 +107,35 @@ class ElectraPretrainer(tf.keras.Model):
        output=output_type,
        name='generator_masked_lm')
    self.classification = layers.ClassificationHead(
-        inner_dim=last_hidden_dim,
+        inner_dim=generator_network._config_dict['hidden_size'],
        num_classes=num_classes,
        initializer=mlm_initializer,
        name='generator_classification_head')
+    self.discriminator_projection = tf.keras.layers.Dense(
+        units=discriminator_network._config_dict['hidden_size'],
+        activation=mlm_activation,
+        kernel_initializer=mlm_initializer,
+        name='discriminator_projection_head')
    self.discriminator_head = tf.keras.layers.Dense(
        units=1, kernel_initializer=mlm_initializer)
  def call(self, inputs):
+    """ELECTRA forward pass.
+    Args:
+      inputs: A dict of all inputs, same as the standard BERT model.
+    Returns:
+      outputs: A dict of pretrainer model outputs, including
+        (1) lm_outputs: a [batch_size, num_token_predictions, vocab_size] tensor
+        indicating logits on masked positions.
+        (2) sentence_outputs: a [batch_size, num_classes] tensor indicating
+        logits for nsp task.
+        (3) disc_logits: a [batch_size, sequence_length] tensor indicating
+        logits for discriminator replaced token detection task.
+        (4) disc_label: a [batch_size, sequence_length] tensor indicating
+        target labels for discriminator replaced token detection task.
+    """
    input_word_ids = inputs['input_word_ids']
    input_mask = inputs['input_mask']
    input_type_ids = inputs['input_type_ids']
@@ -149,10 +169,18 @@ class ElectraPretrainer(tf.keras.Model):
    if isinstance(disc_sequence_output, list):
      disc_sequence_output = disc_sequence_output[-1]
-    disc_logits = self.discriminator_head(disc_sequence_output)
+    disc_logits = self.discriminator_head(
+        self.discriminator_projection(disc_sequence_output))
    disc_logits = tf.squeeze(disc_logits, axis=-1)
-    return lm_outputs, sentence_outputs, disc_logits, disc_label
+    outputs = {
+        'lm_outputs': lm_outputs,
+        'sentence_outputs': sentence_outputs,
+        'disc_logits': disc_logits,
+        'disc_label': disc_label,
+    }
+    return outputs
  def _get_fake_data(self, inputs, mlm_logits, duplicate=True):
    """Generate corrupted data for discriminator.
@@ -191,6 +219,12 @@ class ElectraPretrainer(tf.keras.Model):
        'sampled_tokens': sampled_tokens
    }
+  @property
+  def checkpoint_items(self):
+    """Returns a dictionary of items to be additionally checkpointed."""
+    items = dict(encoder=self.discriminator_network)
+    return items
  def get_config(self):
    return self._config

--- a/official/nlp/modeling/models/electra_pretrainer_test.py
+++ b/official/nlp/modeling/models/electra_pretrainer_test.py
@@ -49,7 +49,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
        vocab_size=vocab_size,
        num_classes=num_classes,
        sequence_length=sequence_length,
-        last_hidden_dim=768,
        num_token_predictions=num_token_predictions,
        disallow_correct=True)
@@ -69,7 +68,11 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
    }
    # Invoke the trainer model on the inputs. This causes the layer to be built.
-    lm_outs, cls_outs, disc_logits, disc_label = eletrca_trainer_model(inputs)
+    outputs = eletrca_trainer_model(inputs)
+    lm_outs = outputs['lm_outputs']
+    cls_outs = outputs['sentence_outputs']
+    disc_logits = outputs['disc_logits']
+    disc_label = outputs['disc_label']
    # Validate that the outputs are of the expected shape.
    expected_lm_shape = [None, num_token_predictions, vocab_size]
@@ -97,7 +100,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
        vocab_size=100,
        num_classes=2,
        sequence_length=3,
-        last_hidden_dim=768,
        num_token_predictions=2)
    # Create a set of 2-dimensional data tensors to feed into the model.
@@ -117,7 +119,7 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
    # Invoke the trainer model on the tensors. In Eager mode, this does the
    # actual calculation. (We can't validate the outputs, since the network is
    # too complex: this simply ensures we're not hitting runtime errors.)
-    _, _, _, _ = eletrca_trainer_model(inputs)
+    _ = eletrca_trainer_model(inputs)
  def test_serialize_deserialize(self):
    """Validate that the ELECTRA trainer can be serialized and deserialized."""
@@ -136,7 +138,6 @@ class ElectraPretrainerTest(keras_parameterized.TestCase):
        vocab_size=100,
        num_classes=2,
        sequence_length=3,
-        last_hidden_dim=768,
        num_token_predictions=2)
    # Create another BERT trainer via serialization and deserialization.

--- a/official/nlp/modeling/networks/albert_transformer_encoder.py
+++ b/official/nlp/modeling/networks/albert_transformer_encoder.py
@@ -40,6 +40,8 @@ class AlbertTransformerEncoder(tf.keras.Model):
  The default values for this object are taken from the ALBERT-Base
  implementation described in the paper.
+  *Note* that the network is constructed by Keras Functional API.
  Arguments:
    vocab_size: The size of the token vocabulary.
    embedding_width: The width of the word embeddings. If the embedding width is

--- a/official/nlp/modeling/networks/classification.py
+++ b/official/nlp/modeling/networks/classification.py
@@ -29,6 +29,9 @@ class Classification(tf.keras.Model):
  This network implements a simple classifier head based on a dense layer. If
  num_classes is one, it can be considered as a regression problem.
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    input_width: The innermost dimension of the input tensor to this network.
    num_classes: The number of classes that this network should classify to. If

--- a/official/nlp/modeling/networks/encoder_scaffold.py
+++ b/official/nlp/modeling/networks/encoder_scaffold.py
@@ -49,6 +49,9 @@ class EncoderScaffold(tf.keras.Model):
  If the hidden_cls is not overridden, a default transformer layer will be
  instantiated.
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    pooled_output_dim: The dimension of pooled output.
    pooler_layer_initializer: The initializer for the classification

--- a/official/nlp/modeling/networks/encoder_scaffold_test.py
+++ b/official/nlp/modeling/networks/encoder_scaffold_test.py
@@ -323,6 +323,28 @@ class EncoderScaffoldLayerClassTest(keras_parameterized.TestCase):
    self.assertAllEqual(network.get_config(), new_network.get_config())
+class Embeddings(tf.keras.Model):
+  def __init__(self, vocab_size, hidden_size):
+    super().__init__()
+    self.inputs = [
+        tf.keras.layers.Input(
+            shape=(None,), dtype=tf.int32, name="input_word_ids"),
+        tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_mask")
+    ]
+    self.attention_mask = layers.SelfAttentionMask()
+    self.embedding_layer = layers.OnDeviceEmbedding(
+        vocab_size=vocab_size,
+        embedding_width=hidden_size,
+        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
+        name="word_embeddings")
+  def call(self, inputs):
+    word_ids, mask = inputs
+    word_embeddings = self.embedding_layer(word_ids)
+    return word_embeddings, self.attention_mask([word_embeddings, mask])
 @keras_parameterized.run_all_keras_modes
 class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
@@ -334,20 +356,7 @@ class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
    # Build an embedding network to swap in for the default network. This one
    # will have 2 inputs (mask and word_ids) instead of 3, and won't use
    # positional embeddings.
+    network = Embeddings(vocab_size, hidden_size)
-    word_ids = tf.keras.layers.Input(
-        shape=(sequence_length,), dtype=tf.int32, name="input_word_ids")
-    mask = tf.keras.layers.Input(
-        shape=(sequence_length,), dtype=tf.int32, name="input_mask")
-    embedding_layer = layers.OnDeviceEmbedding(
-        vocab_size=vocab_size,
-        embedding_width=hidden_size,
-        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
-        name="word_embeddings")
-    word_embeddings = embedding_layer(word_ids)
-    attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])
-    network = tf.keras.Model([word_ids, mask],
-                             [word_embeddings, attention_mask])
    hidden_cfg = {
        "num_attention_heads":
@@ -371,8 +380,7 @@ class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
-        embedding_cls=network,
+        embedding_cls=network)
-        embedding_data=embedding_layer.embeddings)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
@@ -390,11 +398,6 @@ class EncoderScaffoldEmbeddingNetworkTest(keras_parameterized.TestCase):
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    _ = model.predict([word_id_data, mask_data])
-    # Test that we can get the embedding data that we passed to the object. This
-    # is necessary to support standard language model training.
-    self.assertIs(embedding_layer.embeddings,
-                  test_network.get_embedding_table())
  def test_serialize_deserialize(self):
    hidden_size = 32
    sequence_length = 21

--- a/official/nlp/modeling/networks/span_labeling.py
+++ b/official/nlp/modeling/networks/span_labeling.py
@@ -27,6 +27,8 @@ class SpanLabeling(tf.keras.Model):
  """Span labeling network head for BERT modeling.
  This network implements a simple single-span labeler based on a dense layer.
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    input_width: The innermost dimension of the input tensor to this network.

--- a/official/nlp/modeling/networks/token_classification.py
+++ b/official/nlp/modeling/networks/token_classification.py
@@ -27,6 +27,8 @@ class TokenClassification(tf.keras.Model):
  """TokenClassification network head for BERT modeling.
  This network implements a simple token classifier head based on a dense layer.
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    input_width: The innermost dimension of the input tensor to this network.

--- a/official/nlp/modeling/networks/transformer_encoder.py
+++ b/official/nlp/modeling/networks/transformer_encoder.py
@@ -39,6 +39,9 @@ class TransformerEncoder(tf.keras.Model):
  in "BERT: Pre-training of Deep Bidirectional Transformers for Language
  Understanding".
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
  Arguments:
    vocab_size: The size of the token vocabulary.
    hidden_size: The size of the transformer hidden layers.

--- a/official/nlp/modeling/ops/__init__.py
+++ b/official/nlp/modeling/ops/__init__.py
--- a/official/nlp/modeling/ops/beam_search.py
+++ b/official/nlp/modeling/ops/beam_search.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Beam search to find the translated sequence with the highest probability."""
+import numpy as np
+import tensorflow as tf
+def inf(dtype):
+  """Returns a value close to infinity, but is still finite in `dtype`.
+  This is useful to get a very large value that is still zero when multiplied by
+  zero. The floating-point "Inf" value is NaN when multiplied by zero.
+  Args:
+    dtype: A dtype. The returned value will be finite when cast to this dtype.
+      Must compare equal to "float32", "bfloat16" or "float16".
+  Returns:
+    A very large value: 1e7 for float32 and bfloat16, or the largest finite
+    float16 value (as reported by `np.finfo`) for float16.
+  Raises:
+    AssertionError: if `dtype` is not one of the supported dtypes above.
+  """
+  if dtype == "float32" or dtype == "bfloat16":
+    return 1e7
+  elif dtype == "float16":
+    # Disable no-member lint error, as the linter thinks np.float16 does not
+    # exist for some reason.
+    return np.finfo(np.float16).max  # pylint: disable=no-member
+  else:
+    raise AssertionError("Invalid dtype: %s" % dtype)
+class _StateKeys(object):
+  """Keys of the dictionary that stores the state of the beam search loop."""
+  # Variable storing the loop index.
+  CUR_INDEX = "CUR_INDEX"
+  # Top sequences that are alive for each batch item. Alive sequences are ones
+  # that have not generated an EOS token. Sequences that reach EOS are marked as
+  # finished and moved to the FINISHED_SEQ tensor.
+  # Has shape [batch_size, beam_size, CUR_INDEX + 1]
+  ALIVE_SEQ = "ALIVE_SEQ"
+  # Log probabilities of each alive sequence. Shape [batch_size, beam_size]
+  ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS"
+  # Dictionary of cached values for each alive sequence. The cache stores
+  # the encoder output, attention bias, and the decoder attention output from
+  # the previous iteration.
+  ALIVE_CACHE = "ALIVE_CACHE"
+  # Top finished sequences for each batch item.
+  # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are
+  # shorter than CUR_INDEX + 1 are padded with 0s.
+  FINISHED_SEQ = "FINISHED_SEQ"
+  # Scores for each finished sequence. Score = log probability / length norm
+  # Shape [batch_size, beam_size]
+  FINISHED_SCORES = "FINISHED_SCORES"
+  # Flags indicating which sequences in the finished sequences are finished.
+  # At the beginning, all of the sequences in FINISHED_SEQ are filler values.
+  # True -> finished sequence, False -> filler. Shape [batch_size, beam_size]
+  FINISHED_FLAGS = "FINISHED_FLAGS"
+def _expand_to_same_rank(tensor, target):
+  """Expands a given tensor to target's rank to be broadcastable.
+  Size-1 dimensions are appended at the end of `tensor` (via `tf.expand_dims`)
+  until its rank equals the rank of `target`; no data is tiled or copied. If
+  `tensor` already has a rank greater than or equal to that of `target`, it is
+  returned unchanged.
+  Args:
+    tensor: input tensor to expand. Shape: [b, d1, ..., da]
+    target: target tensor. Shape: [b, d1, ..., da, ..., dn]
+  Returns:
+    Expanded tensor of shape [b, d1, ..., da, 1, ..., 1] with the same rank as
+    target.
+  Raises:
+    ValueError: if the rank of the shape of `tensor` or `target` is None.
+  """
+  if tensor.shape.rank is None:
+    raise ValueError("Expect rank for tensor shape, but got None.")
+  if target.shape.rank is None:
+    raise ValueError("Expect rank for target shape, but got None.")
+  with tf.name_scope("expand_rank"):
+    diff_rank = target.shape.rank - tensor.shape.rank
+    for _ in range(diff_rank):
+      tensor = tf.expand_dims(tensor, -1)
+    return tensor
+class SequenceBeamSearch(tf.Module):
+  """Implementation of beam search loop."""
+  def __init__(self,
+               symbols_to_logits_fn,
+               vocab_size,
+               beam_size,
+               alpha,
+               max_decode_length,
+               eos_id,
+               padded_decode,
+               dtype=tf.float32):
+    """Initialize sequence beam search.
+    Args:
+      symbols_to_logits_fn: A function to provide logits, which is the
+        interface to the Transformer model. The passed in arguments are: ids ->
+          A tensor with shape [batch_size * beam_size, index]. index -> A
+          scalar. cache -> A nested dictionary of tensors [batch_size *
+          beam_size, ...].
+        The function must return a tuple of logits and the updated cache: logits
+          -> A tensor with shape [batch * beam_size, vocab_size]. updated cache
+          -> A nested dictionary with the same structure as the input cache.
+      vocab_size: An integer, the size of the vocabulary, used for topk
+        computation.
+      beam_size: An integer, number of beams for beam search.
+      alpha: A float, defining the strength of length normalization.
+      max_decode_length: An integer, the maximum number of steps to decode a
+        sequence.
+      eos_id: An integer. ID of end of sentence token.
+      padded_decode: A bool, indicating if max_sequence_length padding is used
+        for beam search.
+      dtype: A tensorflow data type used for score computation. The default is
+        tf.float32.
+    """
+    self.symbols_to_logits_fn = symbols_to_logits_fn
+    self.vocab_size = vocab_size
+    self.beam_size = beam_size
+    self.alpha = alpha
+    self.max_decode_length = max_decode_length
+    self.eos_id = eos_id
+    self.padded_decode = padded_decode
+    self.dtype = tf.as_dtype(dtype)
+  def search(self, initial_ids, initial_cache):
+    """Beam search for sequences with highest scores.
+    Args:
+      initial_ids: initial ids to pass into the symbols_to_logits_fn. int tensor
+        with shape [batch_size, 1]
+      initial_cache: dictionary storing values to be passed into the
+        symbols_to_logits_fn.
+    Returns:
+      finished_seq and finished_scores.
+    """
+    batch_size = (
+        initial_ids.shape.as_list()[0]
+        if self.padded_decode else tf.shape(initial_ids)[0])
+    state, state_shapes = self._create_initial_state(initial_ids, initial_cache,
+                                                     batch_size)
+    def _grow_alive_seq(state):
+      """Grow alive sequences by one token, collect top 2*beam_size sequences.
+      2*beam_size sequences are collected because some sequences may have
+      reached the EOS token. 2*beam_size ensures that at least beam_size
+      sequences are still alive.
+      Args:
+        state: A dictionary with the current loop state.
+      Returns:
+        Tuple of
+        (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1],
+         Log probabilities of returned sequences [batch_size, 2 * beam_size],
+         Token ids appended to the returned sequences
+           [batch_size, 2 * beam_size],
+         New alive cache, for each of the 2 * beam_size sequences)
+      """
+      i = state[_StateKeys.CUR_INDEX]
+      alive_seq = state[_StateKeys.ALIVE_SEQ]
+      alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
+      alive_cache = state[_StateKeys.ALIVE_CACHE]
+      beams_to_keep = 2 * self.beam_size
+      # Get logits for the next candidate IDs for the alive sequences. Get the
+      # new cache values at the same time.
+      if self.padded_decode:
+        # Padded sequences have a fixed length, so only the token at position i
+        # is sliced out and fed to the model: [batch_size * beam_size, 1].
+        flat_ids = tf.reshape(
+            tf.slice(alive_seq, [0, 0, i], [batch_size, self.beam_size, 1]),
+            [batch_size * self.beam_size, -1])
+      else:
+        flat_ids = _flatten_beam_dim(alive_seq)  # [batch_size * beam_size, cur_index + 1]
+      flat_cache = tf.nest.map_structure(_flatten_beam_dim, alive_cache)
+      flat_logits, flat_cache = self.symbols_to_logits_fn(
+          flat_ids, i, flat_cache)
+      # Unflatten logits to shape [batch_size, beam_size, vocab_size]
+      logits = _unflatten_beam_dim(flat_logits, batch_size, self.beam_size)
+      new_cache = tf.nest.map_structure(
+          lambda t: _unflatten_beam_dim(t, batch_size, self.beam_size),
+          flat_cache)
+      # Convert logits to normalized log probs
+      candidate_log_probs = _log_prob_from_logits(logits)
+      # Calculate new log probabilities if each of the alive sequences were
+      # extended by the candidate IDs.
+      # Shape [batch_size, beam_size, vocab_size]
+      log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2)
+      # Each batch item has beam_size * vocab_size candidate sequences. For each
+      # batch item, get the k candidates with the highest log probabilities.
+      flat_log_probs = tf.reshape(log_probs,
+                                  [-1, self.beam_size * self.vocab_size])
+      topk_log_probs, topk_indices = tf.nn.top_k(
+          flat_log_probs, k=beams_to_keep)
+      # Extract the alive sequences that generate the highest log probabilities
+      # after being extended. A flat index encodes (beam, token) as
+      # beam * vocab_size + token, so integer division recovers the beam.
+      topk_beam_indices = topk_indices // self.vocab_size
+      topk_seq, new_cache = _gather_beams([alive_seq, new_cache],
+                                          topk_beam_indices, batch_size,
+                                          beams_to_keep)
+      # Append the most probable IDs to the topk sequences. The remainder of
+      # the flat index is the token id.
+      topk_ids = topk_indices % self.vocab_size
+      if self.padded_decode:
+        # Write the new ids into slot i + 1 of the fixed-length sequences. The
+        # length axis is moved to the front so a single scatter index selects
+        # that slot, then moved back.
+        topk_seq = tf.transpose(topk_seq, perm=[2, 0, 1])
+        # TODO(b/145533236, hongkuny): Reverts once TF fix the validation.
+        topk_seq = tf.tensor_scatter_nd_update(topk_seq, [[i + 1]],
+                                               tf.expand_dims(topk_ids, axis=0))
+        topk_seq = tf.transpose(topk_seq, perm=[1, 2, 0])
+      else:
+        topk_seq = tf.concat(
+            [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)
+      return topk_seq, topk_log_probs, topk_ids, new_cache
+    def _get_new_alive_state(new_seq, new_log_probs, new_finished_flags,
+                             new_cache):
+      """Gather the top k sequences that are still alive.
+      Args:
+        new_seq: New sequences generated by growing the current alive sequences
+          int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1]
+        new_log_probs: Log probabilities of new sequences, float tensor with
+          shape [batch_size, 2 * beam_size]
+        new_finished_flags: A boolean Tensor that is True for each new sequence
+          whose last appended id equals eos_id (i.e. the sequence is finished).
+        new_cache: Dict of cached values for each sequence.
+      Returns:
+        Dictionary with alive keys from _StateKeys:
+          {Top beam_size sequences that are still alive (don't end with eos_id)
+           Log probabilities of top alive sequences
+           Dict cache storing decoder states for top alive sequences}
+      """
+      # To prevent finished sequences from being considered, set log probs to
+      # -inf.
+      new_log_probs += tf.cast(new_finished_flags,
+                               self.dtype) * -inf(self.dtype)
+      # Keep only the beam_size best candidates (and their caches) by the
+      # masked log probabilities.
+      top_alive_seq, top_alive_log_probs, top_alive_cache = _gather_topk_beams(
+          [new_seq, new_log_probs, new_cache], new_log_probs, batch_size,
+          self.beam_size)
+      return {
+          _StateKeys.ALIVE_SEQ: top_alive_seq,
+          _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs,
+          _StateKeys.ALIVE_CACHE: top_alive_cache
+      }
+    def _get_new_finished_state(state, new_seq, new_log_probs,
+                                new_finished_flags):
+      """Combine new and old finished sequences, and gather the top k sequences.
+      Args:
+        state: A dictionary with the current loop state.
+        new_seq: New sequences generated by growing the current alive sequences
+          int32 tensor with shape [batch_size, 2 * beam_size, i + 1]
+        new_log_probs: Log probabilities of new sequences, float tensor with
+          shape [batch_size, 2 * beam_size]
+        new_finished_flags: A boolean Tensor that is True for each new sequence
+          whose last appended id equals eos_id (i.e. the sequence is finished).
+      Returns:
+        Dictionary with finished keys from _StateKeys:
+          {Top beam_size finished sequences based on score,
+           Scores of finished sequences,
+           Finished flags of finished sequences}
+      """
+      i = state[_StateKeys.CUR_INDEX]
+      finished_seq = state[_StateKeys.FINISHED_SEQ]
+      finished_scores = state[_StateKeys.FINISHED_SCORES]
+      finished_flags = state[_StateKeys.FINISHED_FLAGS]
+      # First append a column of 0-ids to finished_seq to increment the length.
+      # New shape of finished_seq: [batch_size, beam_size, i + 1]
+      # (Skipped for padded decode, where sequences already have full length.)
+      if not self.padded_decode:
+        finished_seq = tf.concat(
+            [finished_seq,
+             tf.zeros([batch_size, self.beam_size, 1], tf.int32)],
+            axis=2)
+      # Calculate new seq scores from log probabilities.
+      length_norm = _length_normalization(self.alpha, i + 1, dtype=self.dtype)
+      new_scores = new_log_probs / length_norm
+      # Set the scores of the still-alive seq in new_seq to large negative
+      # values.
+      new_scores += ((1. - tf.cast(new_finished_flags, self.dtype)) *
+                     -inf(self.dtype))
+      # Combine sequences, scores, and flags (old entries first, then the new
+      # candidates) along the beam axis.
+      finished_seq = tf.concat([finished_seq, new_seq], axis=1)
+      finished_scores = tf.concat([finished_scores, new_scores], axis=1)
+      finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1)
+      # Return the finished sequences with the best scores.
+      top_finished_seq, top_finished_scores, top_finished_flags = (
+          _gather_topk_beams([finished_seq, finished_scores, finished_flags],
+                             finished_scores, batch_size, self.beam_size))
+      return {
+          _StateKeys.FINISHED_SEQ: top_finished_seq,
+          _StateKeys.FINISHED_SCORES: top_finished_scores,
+          _StateKeys.FINISHED_FLAGS: top_finished_flags
+      }
+    def _search_step(state):
+      """Beam search loop body.
+      Grow alive sequences by a single ID. Sequences that have reached the EOS
+      token are marked as finished. The alive and finished sequences with the
+      highest log probabilities and scores are returned.
+      A sequence's finished score is calculated by dividing the log probability
+      by the length normalization factor. Without length normalization, the
+      search is more likely to return shorter sequences.
+      Args:
+        state: A dictionary with the current loop state.
+      Returns:
+        A single-element list holding the new state dictionary (the same
+        structure as the loop_vars passed to tf.while_loop).
+      """
+      # Grow alive sequences by one token.
+      new_seq, new_log_probs, topk_ids, new_cache = _grow_alive_seq(state)
+      # A candidate is finished when the token just appended is eos_id.
+      new_finished_flags = tf.equal(topk_ids, self.eos_id)
+      # Collect top beam_size alive sequences
+      alive_state = _get_new_alive_state(new_seq, new_log_probs,
+                                         new_finished_flags, new_cache)
+      # Combine newly finished sequences with existing finished sequences, and
+      # collect the top k scoring sequences.
+      finished_state = _get_new_finished_state(state, new_seq, new_log_probs,
+                                               new_finished_flags)
+      # Increment loop index and create new state dictionary
+      new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1}
+      new_state.update(alive_state)
+      new_state.update(finished_state)
+      return [new_state]
+    finished_state = tf.nest.map_structure(
+        tf.stop_gradient,
+        tf.while_loop(
+            self._continue_search,
+            _search_step,
+            loop_vars=[state],
+            shape_invariants=[state_shapes],
+            parallel_iterations=1))
+    finished_state = finished_state[0]
+    return self._process_finished_state(finished_state)
+  def _process_finished_state(self, finished_state):
+    """Pick the final sequences and scores out of the last loop state.
+    For every batch item that has at least one finished sequence the finished
+    sequences/scores are returned; otherwise the alive sequences and their log
+    probabilities are returned for that item.
+    Args:
+      finished_state: The state dictionary produced by the search loop.
+    Returns:
+      Tuple of (sequences, scores).
+    """
+    finished_seq = finished_state[_StateKeys.FINISHED_SEQ]
+    finished_scores = finished_state[_StateKeys.FINISHED_SCORES]
+    # TF2 changes tf.where behavior. Should make parameters broadcastable.
+    has_finished = tf.reduce_any(
+        finished_state[_StateKeys.FINISHED_FLAGS], 1, name="finished_cond")
+    # Account for corner case where there are no finished sequences for a
+    # particular batch item. In that case, return alive sequences for that batch
+    # item.
+    chosen_seq = tf.where(
+        _expand_to_same_rank(has_finished, finished_seq), finished_seq,
+        finished_state[_StateKeys.ALIVE_SEQ])
+    chosen_scores = tf.where(
+        _expand_to_same_rank(has_finished, finished_scores), finished_scores,
+        finished_state[_StateKeys.ALIVE_LOG_PROBS])
+    return chosen_seq, chosen_scores
+  def _create_initial_state(self, initial_ids, initial_cache, batch_size):
+    """Return initial state dictionary and its shape invariants.
+    Args:
+      initial_ids: Starting ids, one per batch item; tiled to every beam.
+      initial_cache: Dictionary of (possibly nested) tensors passed to
+        symbols_to_logits_fn. Every tensor must have dtype self.dtype.
+      batch_size: Batch size (a python int for padded decode, otherwise it may
+        be a tensor).
+    Returns:
+      Tuple of (state dictionary keyed by _StateKeys, dictionary of
+      tf.TensorShape invariants with the same keys).
+    Raises:
+      TypeError: If any tensor in initial_cache has a dtype other than
+        self.dtype.
+    """
+    for key, value in initial_cache.items():
+      for inner_value in tf.nest.flatten(value):
+        if inner_value.dtype != self.dtype:
+          # Report the dtype of the offending leaf tensor. `value` itself may
+          # be a nested structure (e.g. a dict) that has no `dtype` attribute.
+          raise TypeError(
+              "initial_cache element for key '%s' has dtype %s that does not "
+              "match SequenceBeamSearch's dtype of %s. Value: %s" %
+              (key, inner_value.dtype.name, self.dtype.name, inner_value))
+    # Current loop index (starts at 0)
+    cur_index = tf.constant(0)
+    # Create alive sequence with shape [batch_size, beam_size, 1]
+    alive_seq = _expand_to_beam_size(initial_ids, self.beam_size)
+    alive_seq = tf.expand_dims(alive_seq, axis=2)
+    if self.padded_decode:
+      # Padded decode keeps a fixed-length buffer of max_decode_length + 1
+      # positions instead of growing the sequence every step.
+      alive_seq = tf.tile(alive_seq, [1, 1, self.max_decode_length + 1])
+    # Create tensor for storing initial log probabilities.
+    # Assume initial_ids are prob 1.0
+    # Only the first beam starts alive (log prob 0); the others start at -inf
+    # so that identical copies of the initial sequence are not all expanded.
+    initial_log_probs = tf.constant([[0.] + [-float("inf")] *
+                                     (self.beam_size - 1)],
+                                    dtype=self.dtype)
+    alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
+    # Expand all values stored in the dictionary to the beam size, so that each
+    # beam has a separate cache.
+    alive_cache = tf.nest.map_structure(
+        lambda t: _expand_to_beam_size(t, self.beam_size), initial_cache)
+    # Initialize tensor storing finished sequences with filler values.
+    finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32)
+    # Set scores of the initial finished seqs to negative infinity.
+    finished_scores = tf.ones([batch_size, self.beam_size],
+                              dtype=self.dtype) * -inf(self.dtype)
+    # Initialize finished flags with all False values.
+    finished_flags = tf.zeros([batch_size, self.beam_size], tf.bool)
+    # Create state dictionary
+    state = {
+        _StateKeys.CUR_INDEX: cur_index,
+        _StateKeys.ALIVE_SEQ: alive_seq,
+        _StateKeys.ALIVE_LOG_PROBS: alive_log_probs,
+        _StateKeys.ALIVE_CACHE: alive_cache,
+        _StateKeys.FINISHED_SEQ: finished_seq,
+        _StateKeys.FINISHED_SCORES: finished_scores,
+        _StateKeys.FINISHED_FLAGS: finished_flags
+    }
+    # Create state invariants for each value in the state dictionary. Each
+    # dimension must be a constant or None. A None dimension means either:
+    #   1) the dimension's value is a tensor that remains the same but may
+    #      depend on the input sequence to the model (e.g. batch size).
+    #   2) the dimension may have different values on different iterations.
+    if self.padded_decode:
+      # All shapes are fixed when decoding with padding.
+      state_shape_invariants = {
+          _StateKeys.CUR_INDEX:
+              tf.TensorShape([]),
+          _StateKeys.ALIVE_SEQ:
+              tf.TensorShape(
+                  [batch_size, self.beam_size, self.max_decode_length + 1]),
+          _StateKeys.ALIVE_LOG_PROBS:
+              tf.TensorShape([batch_size, self.beam_size]),
+          _StateKeys.ALIVE_CACHE:
+              tf.nest.map_structure(_get_shape, alive_cache),
+          _StateKeys.FINISHED_SEQ:
+              tf.TensorShape(
+                  [batch_size, self.beam_size, self.max_decode_length + 1]),
+          _StateKeys.FINISHED_SCORES:
+              tf.TensorShape([batch_size, self.beam_size]),
+          _StateKeys.FINISHED_FLAGS:
+              tf.TensorShape([batch_size, self.beam_size])
+      }
+    else:
+      # Batch size and sequence length are left unconstrained here.
+      state_shape_invariants = {
+          _StateKeys.CUR_INDEX:
+              tf.TensorShape([]),
+          _StateKeys.ALIVE_SEQ:
+              tf.TensorShape([None, self.beam_size, None]),
+          _StateKeys.ALIVE_LOG_PROBS:
+              tf.TensorShape([None, self.beam_size]),
+          _StateKeys.ALIVE_CACHE:
+              tf.nest.map_structure(_get_shape_keep_last_dim, alive_cache),
+          _StateKeys.FINISHED_SEQ:
+              tf.TensorShape([None, self.beam_size, None]),
+          _StateKeys.FINISHED_SCORES:
+              tf.TensorShape([None, self.beam_size]),
+          _StateKeys.FINISHED_FLAGS:
+              tf.TensorShape([None, self.beam_size])
+      }
+    return state, state_shape_invariants
+  def _continue_search(self, state):
+    """Return whether to continue the search loop.
+    The loop should terminate when
+      1) the maximum decode length has been reached, or
+      2) the worst score in the finished sequences is better than the best
+         score in the alive sequences (i.e. the finished sequences are provably
+         unchanging)
+    Args:
+      state: A dictionary with the current loop state.
+    Returns:
+      Bool tensor with value True if loop should continue, False if loop should
+      terminate.
+    """
+    i = state[_StateKeys.CUR_INDEX]
+    alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS]
+    finished_scores = state[_StateKeys.FINISHED_SCORES]
+    finished_flags = state[_StateKeys.FINISHED_FLAGS]
+    not_at_max_decode_length = tf.less(i, self.max_decode_length)
+    # Calculate largest length penalty (the larger penalty, the better score).
+    max_length_norm = _length_normalization(
+        self.alpha, self.max_decode_length, dtype=self.dtype)
+    # Get the best possible scores from alive sequences. Only beam 0 is read
+    # for each batch item.
+    best_alive_scores = alive_log_probs[:, 0] / max_length_norm
+    # Compute worst score in finished sequences for each batch element
+    finished_scores *= tf.cast(finished_flags,
+                               self.dtype)  # set filler scores to zero
+    lowest_finished_scores = tf.reduce_min(finished_scores, axis=1)
+    # If there are no finished sequences in a batch element, then set the lowest
+    # finished score to -INF for that element.
+    finished_batches = tf.reduce_any(finished_flags, 1)
+    lowest_finished_scores += ((1.0 - tf.cast(finished_batches, self.dtype)) *
+                               -inf(self.dtype))
+    # The early-exit condition must hold for every batch element at once.
+    worst_finished_score_better_than_best_alive_score = tf.reduce_all(
+        tf.greater(lowest_finished_scores, best_alive_scores))
+    return tf.logical_and(
+        not_at_max_decode_length,
+        tf.logical_not(worst_finished_score_better_than_best_alive_score))
+def sequence_beam_search(symbols_to_logits_fn,
+                         initial_ids,
+                         initial_cache,
+                         vocab_size,
+                         beam_size,
+                         alpha,
+                         max_decode_length,
+                         eos_id,
+                         padded_decode=False,
+                         dtype="float32"):
+  """Search for sequence of subtoken ids with the largest probability.
+  Convenience wrapper that builds a SequenceBeamSearch from the arguments and
+  runs its search() on initial_ids and initial_cache.
+  Args:
+    symbols_to_logits_fn: A function that takes in ids, index, and cache as
+      arguments. The passed in arguments will have shape: ids -> A tensor with
+        shape [batch_size * beam_size, index]. index -> A scalar. cache -> A
+        nested dictionary of tensors [batch_size * beam_size, ...].
+      The function must return a tuple of logits and new cache: logits -> A
+        tensor with shape [batch * beam_size, vocab_size]. new cache -> A nested
+        dictionary with the same shape/structure as the inputted cache.
+    initial_ids: An int32 tensor with shape [batch_size]. Starting ids for each
+      batch item.
+    initial_cache: A dictionary, containing starting decoder variables
+      information.
+    vocab_size: An integer, the size of the vocabulary (number of tokens).
+    beam_size: An integer, the number of beams.
+    alpha: A float, defining the strength of length normalization.
+    max_decode_length: An integer, the maximum length to decode a sequence.
+    eos_id: An integer, ID of eos token, used to determine when a sequence has
+      finished.
+    padded_decode: A bool, indicating if max_sequence_length padding is used for
+      beam search.
+    dtype: A tensorflow data type (or its name) used for score computation. The
+      default is "float32".
+  Returns:
+    Top decoded sequences [batch_size, beam_size, max_decode_length]
+    sequence scores [batch_size, beam_size]
+  """
+  sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, beam_size, alpha,
+                           max_decode_length, eos_id, padded_decode, dtype)
+  return sbs.search(initial_ids, initial_cache)
+def _log_prob_from_logits(logits):
+  """Normalize logits into log probabilities along axis 2."""
+  log_normalizer = tf.reduce_logsumexp(logits, axis=2, keepdims=True)
+  return logits - log_normalizer
+def _length_normalization(alpha, length, dtype=tf.float32):
+  """Return the length normalization factor ((5 + length) / 6) ** alpha."""
+  base = (5. + tf.cast(length, dtype)) / 6.
+  return tf.pow(base, alpha)
+def _expand_to_beam_size(tensor, beam_size):
+  """Insert a beam axis after the batch axis and repeat the tensor along it.
+  Args:
+    tensor: tensor to tile [batch_size, ...]
+    beam_size: Number of copies to make along the new axis.
+  Returns:
+    Tiled tensor [batch_size, beam_size, ...]
+  """
+  with_beam_axis = tf.expand_dims(tensor, axis=1)
+  # Repeat only along the freshly inserted axis 1; leave every other axis as is.
+  multiples = [1, beam_size] + [1] * (with_beam_axis.shape.ndims - 2)
+  return tf.tile(with_beam_axis, multiples)
+def _shape_list(tensor):
+  """Return the tensor's shape as a list that contains no None entries.
+  Statically known dimensions are kept as python ints; each unknown dimension
+  is replaced by the matching element of the dynamic tf.shape tensor.
+  """
+  static_dims = tensor.get_shape().as_list()
+  dynamic_dims = tf.shape(tensor)
+  return [
+      dynamic_dims[axis] if dim is None else dim
+      for axis, dim in enumerate(static_dims)
+  ]
+def _get_shape_keep_last_dim(tensor):
+  """Return a TensorShape where only the last dimension may be defined.
+  All leading dimensions are set to None. The last dimension is kept when it is
+  statically known and set to None when it is only available as a tensor.
+  """
+  dims = _shape_list(tensor)
+  last_dim = dims[-1]
+  if isinstance(last_dim, tf.Tensor):
+    last_dim = None
+  return tf.TensorShape([None] * (len(dims) - 1) + [last_dim])
+def _get_shape(tensor):
+  """Wrap the result of _shape_list(tensor) in a tf.TensorShape."""
+  dims = _shape_list(tensor)
+  return tf.TensorShape(dims)
+def _flatten_beam_dim(tensor):
+  """Merge the first two dimensions of a tensor into one.
+  Args:
+    tensor: Tensor to reshape of shape [A, B, ...]
+  Returns:
+    Reshaped tensor of shape [A*B, ...]
+  """
+  dims = _shape_list(tensor)
+  merged = dims[0] * dims[1]  # batch and beam dims collapse into one
+  return tf.reshape(tensor, [merged] + dims[2:])
+def _unflatten_beam_dim(tensor, batch_size, beam_size):
+  """Split the first dimension back into [batch_size, beam_size].
+  Args:
+    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
+    batch_size: Tensor, original batch size.
+    beam_size: int, original beam size.
+  Returns:
+    Reshaped tensor of shape [batch_size, beam_size, ...]
+  """
+  trailing_dims = _shape_list(tensor)[1:]
+  return tf.reshape(tensor, [batch_size, beam_size] + trailing_dims)
+def _gather_beams(nested, beam_indices, batch_size, new_beam_size):
+  """Select beams from every tensor of a nested structure.
+  Each tensor in nested holds a batch of beams, where a beam is one search
+  state (beam search explores several states in parallel). The beams named by
+  beam_indices are picked out of every tensor.
+  Args:
+    nested: Nested structure (tensor, list, tuple or dict) containing tensors
+      with shape [batch_size, beam_size, ...].
+    beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each
+      value in beam_indices must be between [0, beam_size), and are not
+      necessarily unique.
+    batch_size: int size of batch
+    new_beam_size: int number of beams to be pulled from the nested tensors.
+  Returns:
+    Nested structure containing tensors with shape
+      [batch_size, new_beam_size, ...]
+  """
+  # Batch index of every gathered beam, e.g. [[0,0,0,0],[1,1,1,1],..].
+  flat_batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size
+  batch_pos = tf.reshape(flat_batch_pos, [batch_size, new_beam_size])
+  # Pair each batch index with its beam index. The stacked tensor has shape
+  # [batch_size, new_beam_size, 2]; the last axis is the (i, j) coordinate
+  # handed to tf.gather_nd.
+  coordinates = tf.stack([batch_pos, beam_indices], axis=2)
+  def _gather(state):
+    return tf.gather_nd(state, coordinates)
+  return tf.nest.map_structure(_gather, nested)
+def _gather_topk_beams(nested, score_or_log_prob, batch_size, beam_size):
+  """Gather the beam_size highest-scoring beams from a nested structure."""
+  top_k = tf.nn.top_k(score_or_log_prob, k=beam_size)
+  return _gather_beams(nested, top_k.indices, batch_size, beam_size)
--- a/official/nlp/transformer/beam_search_v1_test.py
+++ b/official/nlp/transformer/beam_search_v1_test.py
@@ -14,33 +14,19 @@
 # ==============================================================================
 """Test beam search helper methods."""
-import tensorflow.compat.v1 as tf
+import tensorflow as tf
-from official.nlp.transformer import beam_search_v1 as beam_search
+from official.nlp.modeling.ops import beam_search
 class BeamSearchHelperTests(tf.test.TestCase):
-  def setUp(self):
-    super(BeamSearchHelperTests, self).setUp()
-    tf.compat.v1.disable_eager_execution()
  def test_expand_to_beam_size(self):
    x = tf.ones([7, 4, 2, 5])
    x = beam_search._expand_to_beam_size(x, 3)
-    with self.session() as sess:
+    shape = tf.shape(x)
-      shape = sess.run(tf.shape(x))
    self.assertAllEqual([7, 3, 4, 2, 5], shape)
-  def test_shape_list(self):
-    y = tf.compat.v1.placeholder(dtype=tf.int32, shape=[])
-    x = tf.ones([7, y, 2, 5])
-    shape = beam_search._shape_list(x)
-    self.assertIsInstance(shape[0], int)
-    self.assertIsInstance(shape[1], tf.Tensor)
-    self.assertIsInstance(shape[2], int)
-    self.assertIsInstance(shape[3], int)
  def test_get_shape_keep_last_dim(self):
    y = tf.constant(4.0)
    x = tf.ones([7, tf.cast(tf.sqrt(y), tf.int32), 2, 5])
@@ -51,16 +37,12 @@ class BeamSearchHelperTests(tf.test.TestCase):
  def test_flatten_beam_dim(self):
    x = tf.ones([7, 4, 2, 5])
    x = beam_search._flatten_beam_dim(x)
-    with self.session() as sess:
+    self.assertAllEqual([28, 2, 5], tf.shape(x))
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([28, 2, 5], shape)
  def test_unflatten_beam_dim(self):
    x = tf.ones([28, 2, 5])
    x = beam_search._unflatten_beam_dim(x, 7, 4)
-    with self.session() as sess:
+    self.assertAllEqual([7, 4, 2, 5], tf.shape(x))
-      shape = sess.run(tf.shape(x))
-    self.assertAllEqual([7, 4, 2, 5], shape)
  def test_gather_beams(self):
    x = tf.reshape(tf.range(24), [2, 3, 4])
@@ -73,9 +55,6 @@ class BeamSearchHelperTests(tf.test.TestCase):
    #                  [20 21 22 23]]]
    y = beam_search._gather_beams(x, [[1, 2], [0, 2]], 2, 2)
-    with self.session() as sess:
-      y = sess.run(y)
    self.assertAllEqual([[[4, 5, 6, 7],
                          [8, 9, 10, 11]],
                         [[12, 13, 14, 15],
@@ -87,9 +66,6 @@ class BeamSearchHelperTests(tf.test.TestCase):
    x_scores = [[0, 1, 1], [1, 0, 1]]
    y = beam_search._gather_topk_beams(x, x_scores, 2, 2)
-    with self.session() as sess:
-      y = sess.run(y)
    self.assertAllEqual([[[4, 5, 6, 7],
                          [8, 9, 10, 11]],
                         [[12, 13, 14, 15],

--- a/official/nlp/nhnet/models.py
+++ b/official/nlp/nhnet/models.py
@@ -31,7 +31,7 @@ from official.nlp.modeling.layers import multi_channel_attention
 from official.nlp.nhnet import configs
 from official.nlp.nhnet import decoder
 from official.nlp.nhnet import utils
-from official.nlp.transformer import beam_search
+from official.nlp.modeling.ops import beam_search
 def embedding_linear(embedding_matrix, x):

--- a/official/nlp/tasks/electra_task.py
+++ b/official/nlp/tasks/electra_task.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ELECTRA pretraining task (Joint Masked LM and Replaced Token Detection)."""
+import dataclasses
+import tensorflow as tf
+from official.core import base_task
+from official.modeling.hyperparams import config_definitions as cfg
+from official.nlp.configs import bert
+from official.nlp.configs import electra
+from official.nlp.data import pretrain_dataloader
+@dataclasses.dataclass
+class ELECTRAPretrainConfig(cfg.TaskConfig):
+  """The model config."""
+  # Defaults are created per instance through default_factory. A dataclass
+  # instance used directly as a default would be shared by every config object,
+  # and newer Python versions reject such unhashable defaults outright.
+  # The default model carries one classification head named 'next_sentence'.
+  model: electra.ELECTRAPretrainerConfig = dataclasses.field(
+      default_factory=lambda: electra.ELECTRAPretrainerConfig(
+          cls_heads=[
+              bert.ClsHeadConfig(
+                  inner_dim=768,
+                  num_classes=2,
+                  dropout_rate=0.1,
+                  name='next_sentence')
+          ]))
+  train_data: cfg.DataConfig = dataclasses.field(
+      default_factory=cfg.DataConfig)
+  validation_data: cfg.DataConfig = dataclasses.field(
+      default_factory=cfg.DataConfig)
+@base_task.register_task_cls(ELECTRAPretrainConfig)
+class ELECTRAPretrainTask(base_task.Task):
+  """ELECTRA Pretrain Task (Masked LM + Replaced Token Detection)."""
+  def build_model(self):
+    """Instantiate the pretrainer from the task's model config."""
+    model_config = self.task_config.model
+    return electra.instantiate_pretrainer_from_cfg(model_config)
+  def build_losses(self,
+                   labels,
+                   model_outputs,
+                   metrics,
+                   aux_losses=None) -> tf.Tensor:
+    """Compute the total pretraining loss and update the loss metrics.
+    The total is the masked-LM loss, plus the next-sentence loss when
+    'next_sentence_labels' is present in labels, plus the replaced token
+    detection loss scaled by the configured discriminator_loss_weight, plus
+    any aux_losses.
+    Args:
+      labels: Dictionary read for 'masked_lm_ids', 'masked_lm_weights',
+        'input_mask' and, optionally, 'next_sentence_labels'.
+      model_outputs: Dictionary read for 'lm_outputs', 'disc_logits',
+        'disc_label' and, when next-sentence labels are given,
+        'sentence_outputs'.
+      metrics: Iterable of metric objects, looked up by name. Must contain
+        'lm_example_loss', 'discriminator_loss', 'total_loss' and, when
+        next-sentence labels are given, 'next_sentence_loss'.
+      aux_losses: Optional list of extra loss tensors added to the total.
+    Returns:
+      The total loss tensor.
+    """
+    metrics = {metric.name: metric for metric in metrics}
+    # generator lm and (optional) nsp loss.
+    lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
+        labels['masked_lm_ids'],
+        tf.cast(model_outputs['lm_outputs'], tf.float32),
+        from_logits=True)
+    lm_label_weights = labels['masked_lm_weights']
+    # Weighted mean over the predicted positions; divide_no_nan yields 0 when
+    # all weights are zero.
+    lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
+    lm_denominator_loss = tf.reduce_sum(lm_label_weights)
+    mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss)
+    metrics['lm_example_loss'].update_state(mlm_loss)
+    if 'next_sentence_labels' in labels:
+      sentence_labels = labels['next_sentence_labels']
+      sentence_outputs = tf.cast(
+          model_outputs['sentence_outputs'], dtype=tf.float32)
+      # The cross entropy is computed per example; average it so that it is a
+      # scalar like the other loss terms and the total stays a scalar.
+      sentence_loss = tf.reduce_mean(
+          tf.keras.losses.sparse_categorical_crossentropy(
+              sentence_labels, sentence_outputs, from_logits=True))
+      metrics['next_sentence_loss'].update_state(sentence_loss)
+      total_loss = mlm_loss + sentence_loss
+    else:
+      total_loss = mlm_loss
+    # discriminator replaced token detection (rtd) loss.
+    rtd_logits = model_outputs['disc_logits']
+    rtd_labels = tf.cast(model_outputs['disc_label'], tf.float32)
+    input_mask = tf.cast(labels['input_mask'], tf.float32)
+    rtd_ind_loss = tf.nn.sigmoid_cross_entropy_with_logits(
+        logits=rtd_logits, labels=rtd_labels)
+    # Mean over the positions selected by input_mask.
+    rtd_numerator = tf.reduce_sum(input_mask * rtd_ind_loss)
+    rtd_denominator = tf.reduce_sum(input_mask)
+    rtd_loss = tf.math.divide_no_nan(rtd_numerator, rtd_denominator)
+    metrics['discriminator_loss'].update_state(rtd_loss)
+    total_loss = total_loss + (
+        self.task_config.model.discriminator_loss_weight * rtd_loss)
+    if aux_losses:
+      total_loss += tf.add_n(aux_losses)
+    metrics['total_loss'].update_state(total_loss)
+    return total_loss
+  def build_inputs(self, params, input_context=None):
+    """Returns tf.data.Dataset for pretraining.
+    When params.input_path is 'dummy', an endless dataset of all-zero features
+    is produced; otherwise the data is loaded with BertPretrainDataLoader.
+    """
+    if params.input_path != 'dummy':
+      return pretrain_dataloader.BertPretrainDataLoader(params).load(
+          input_context)
+    def dummy_data(_):
+      seq_zeros = tf.zeros((1, params.seq_length), dtype=tf.int32)
+      prediction_zeros = tf.zeros(
+          (1, params.max_predictions_per_seq), dtype=tf.int32)
+      return {
+          'input_word_ids': seq_zeros,
+          'input_mask': seq_zeros,
+          'input_type_ids': seq_zeros,
+          'masked_lm_positions': prediction_zeros,
+          'masked_lm_ids': prediction_zeros,
+          'masked_lm_weights': tf.cast(prediction_zeros, dtype=tf.float32),
+          'next_sentence_labels': tf.zeros((1, 1), dtype=tf.int32),
+      }
+    # A single element repeated forever, mapped to the zero-valued features.
+    return tf.data.Dataset.range(1).repeat().map(
+        dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  def build_metrics(self, training=None):
+    """Create the list of metrics tracked by this task.
+    The next-sentence metrics are only added when the training data config has
+    use_next_sentence_label set. The training argument is ignored.
+    """
+    del training
+    sparse_accuracy = tf.keras.metrics.SparseCategoricalAccuracy
+    mean = tf.keras.metrics.Mean
+    metrics = [
+        sparse_accuracy(name='masked_lm_accuracy'),
+        mean(name='lm_example_loss'),
+        sparse_accuracy(name='discriminator_accuracy'),
+    ]
+    if self.task_config.train_data.use_next_sentence_label:
+      metrics.extend([
+          sparse_accuracy(name='next_sentence_accuracy'),
+          mean(name='next_sentence_loss'),
+      ])
+    metrics.extend([
+        mean(name='discriminator_loss'),
+        mean(name='total_loss'),
+    ])
+    return metrics
+  def process_metrics(self, metrics, labels, model_outputs):
+    """Update the accuracy metrics that are present in metrics.
+    Args:
+      metrics: Iterable of metric objects, looked up by name.
+      labels: Dictionary of label tensors.
+      model_outputs: Dictionary of model output tensors.
+    """
+    by_name = {metric.name: metric for metric in metrics}
+    if 'masked_lm_accuracy' in by_name:
+      by_name['masked_lm_accuracy'].update_state(
+          labels['masked_lm_ids'], model_outputs['lm_outputs'],
+          labels['masked_lm_weights'])
+    if 'next_sentence_accuracy' in by_name:
+      by_name['next_sentence_accuracy'].update_state(
+          labels['next_sentence_labels'], model_outputs['sentence_outputs'])
+    if 'discriminator_accuracy' in by_name:
+      # Turn the single discriminator logit into a two-class logit pair
+      # [-logit, logit] so that the sparse categorical accuracy can be used.
+      positive_logits = tf.expand_dims(model_outputs['disc_logits'], -1)
+      two_class_logits = tf.concat(
+          [-1.0 * positive_logits, positive_logits], -1)
+      by_name['discriminator_accuracy'].update_state(
+          model_outputs['disc_label'], two_class_logits, labels['input_mask'])
+  def train_step(self, inputs, model: tf.keras.Model,
+                 optimizer: tf.keras.optimizers.Optimizer, metrics):
+    """Runs one forward and backward pass and applies the gradients.
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the model, forward pass definition.
+      optimizer: the optimizer for this training step.
+      metrics: a nested structure of metrics objects.
+    Returns:
+      A dictionary of logs.
+    """
+    with tf.GradientTape() as tape:
+      outputs = model(inputs, training=True)
+      # Computes per-replica loss.
+      loss = self.build_losses(
+          labels=inputs,
+          model_outputs=outputs,
+          metrics=metrics,
+          aux_losses=model.losses)
+      # Scales loss as the default gradients allreduce performs sum inside the
+      # optimizer.
+      # TODO(b/154564893): enable loss scaling.
+      num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
+      scaled_loss = loss / num_replicas
+    trainable_vars = model.trainable_variables
+    gradients = tape.gradient(scaled_loss, trainable_vars)
+    optimizer.apply_gradients(list(zip(gradients, trainable_vars)))
+    self.process_metrics(metrics, inputs, outputs)
+    # The unscaled loss is what gets logged.
+    return {self.loss: loss}
+  def validation_step(self, inputs, model: tf.keras.Model, metrics):
+    """Runs one forward pass in inference mode and updates the metrics.
+    Args:
+      inputs: a dictionary of input tensors.
+      model: the keras.Model.
+      metrics: a nested structure of metrics objects.
+    Returns:
+      A dictionary of logs.
+    """
+    outputs = model(inputs, training=False)
+    loss = self.build_losses(
+        labels=inputs,
+        model_outputs=outputs,
+        metrics=metrics,
+        aux_losses=model.losses)
+    self.process_metrics(metrics, inputs, outputs)
+    logs = {self.loss: loss}
+    return logs