Unverified Commit 0225b135 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

parents 7479dbb8 4c571a3c
@@ -68,8 +68,17 @@ class ExportModule(tf.Module, metaclass=abc.ABCMeta):
     if inference_step is not None:
       self.inference_step = functools.partial(inference_step, model=self.model)
     else:
-      self.inference_step = functools.partial(
-          self.model.__call__, training=False)
+      if issubclass(type(model), tf.keras.Model):
+        # Default to self.model.call instead of self.model.__call__ to avoid
+        # the keras tracing logic designed for training.
+        # Since most Model Garden models' `call` either has no `training`
+        # kwarg or defaults it to False, we don't pass anything here.
+        # Please pass a custom inference step if your model defaults to
+        # training=True.
+        self.inference_step = self.model.call
+      else:
+        self.inference_step = functools.partial(
+            self.model.__call__, training=False)
     self.preprocessor = preprocessor
     self.postprocessor = postprocessor
...
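Why the new default matters: invoking a Keras model through `__call__` runs Keras's training-oriented tracing (learning-phase and mask bookkeeping) that is unneeded at export time, while the plain `call` method skips it. A minimal runnable sketch of the dispatch above, assuming a toy subclassed model (`ToyModel` is hypothetical, not Model Garden code):

import functools
import tensorflow as tf

class ToyModel(tf.keras.Model):

  def call(self, inputs):
    # Like most Model Garden models: no `training` kwarg needed at inference.
    return inputs * 2.0

model = ToyModel()
if issubclass(type(model), tf.keras.Model):
  inference_step = model.call  # Bypasses Keras __call__ bookkeeping.
else:
  inference_step = functools.partial(model.__call__, training=False)

print(inference_step(tf.constant([1.0, 2.0])))  # tf.Tensor([2. 4.], ...)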
@@ -13,6 +13,7 @@
 # limitations under the License.

 """Registry utility."""
+from absl import logging


 def register(registered_collection, reg_key):
@@ -54,8 +55,16 @@ def register(registered_collection, reg_key):
       leaf_reg_key = reg_key
     if leaf_reg_key in collection:
-      raise KeyError("Function or class {} registered multiple times.".format(
-          leaf_reg_key))
+      if "beta" in fn_or_cls.__module__:
+        # TODO(yeqing): Clean this temporary branch for beta.
+        logging.warn(
+            "Duplicate registration of beta module "
+            "name %r new %r old %r", reg_key, collection[leaf_reg_key],
+            fn_or_cls.__module__)
+        return fn_or_cls
+      else:
+        raise KeyError("Function or class {} registered multiple times.".format(
+            leaf_reg_key))
     collection[leaf_reg_key] = fn_or_cls
   return fn_or_cls
...
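The change relaxes the duplicate-registration guard for modules whose import path contains "beta": instead of raising, it logs and keeps the first registration. A simplified stand-in for the decorator (the real `register` also supports dotted keys, which this sketch omits):

_REGISTRY = {}

def register(collection, reg_key):
  """Simplified sketch of official.core.registry.register."""
  def decorator(fn_or_cls):
    if reg_key in collection:
      if "beta" in fn_or_cls.__module__:
        print("Duplicate registration of beta module %r, keeping the old one"
              % reg_key)
        return fn_or_cls  # First registration wins; no error for beta.
      raise KeyError(
          "Function or class {} registered multiple times.".format(reg_key))
    collection[reg_key] = fn_or_cls
    return fn_or_cls
  return decorator

@register(_REGISTRY, "my_task")
def build_my_task():
  return "task"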
@@ -48,6 +48,8 @@ class OptimizerConfig(oneof.OneOfConfig):
   sgd_experimental: opt_cfg.SGDExperimentalConfig = (
       opt_cfg.SGDExperimentalConfig())
   adam: opt_cfg.AdamConfig = opt_cfg.AdamConfig()
+  adam_experimental: opt_cfg.AdamExperimentalConfig = (
+      opt_cfg.AdamExperimentalConfig())
   adamw: opt_cfg.AdamWeightDecayConfig = opt_cfg.AdamWeightDecayConfig()
   lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig()
   rmsprop: opt_cfg.RMSPropConfig = opt_cfg.RMSPropConfig()
...
@@ -67,6 +67,7 @@ class SGDExperimentalConfig(BaseOptimizerConfig):
     name: name of the optimizer.
     nesterov: nesterov for SGD optimizer.
     momentum: momentum for SGD optimizer.
+    jit_compile: if True, jit compile will be used.
   """
   name: str = "SGD"
   nesterov: bool = False
@@ -135,6 +136,30 @@ class AdamConfig(BaseOptimizerConfig):
   amsgrad: bool = False


+@dataclasses.dataclass
+class AdamExperimentalConfig(BaseOptimizerConfig):
+  """Configuration for the experimental Adam optimizer.
+
+  The attributes of this class match the arguments of
+  `tf.keras.optimizers.experimental.Adam`.
+
+  Attributes:
+    name: name of the optimizer.
+    beta_1: decay rate for 1st order moments.
+    beta_2: decay rate for 2nd order moments.
+    epsilon: epsilon value used for numerical stability in Adam optimizer.
+    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
+      from the paper "On the Convergence of Adam and Beyond".
+    jit_compile: if True, jit compile will be used.
+  """
+  name: str = "Adam"
+  beta_1: float = 0.9
+  beta_2: float = 0.999
+  epsilon: float = 1e-07
+  amsgrad: bool = False
+  jit_compile: bool = False
+
+
 @dataclasses.dataclass
 class AdamWeightDecayConfig(BaseOptimizerConfig):
   """Configuration for Adam optimizer with weight decay.
...
@@ -30,6 +30,7 @@ OPTIMIZERS_CLS = {
     'sgd': tf.keras.optimizers.SGD,
     'sgd_experimental': tf.keras.optimizers.experimental.SGD,
     'adam': tf.keras.optimizers.Adam,
+    'adam_experimental': tf.keras.optimizers.experimental.Adam,
     'adamw': nlp_optimization.AdamWeightDecay,
     'lamb': tfa_optimizers.LAMB,
     'rmsprop': tf.keras.optimizers.RMSprop,
...
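Together, these hunks wire up the new oneof option end to end: `OptimizerConfig.adam_experimental` selects `AdamExperimentalConfig`, and the factory maps the string key to the Keras class. A hedged usage sketch, assuming the factory API in `official.modeling.optimization` (config values are illustrative):

from official.modeling import optimization

opt_config = optimization.OptimizationConfig({
    'optimizer': {
        'type': 'adam_experimental',
        'adam_experimental': {'epsilon': 1e-7, 'jit_compile': True},
    },
    'learning_rate': {
        'type': 'constant',
        'constant': {'learning_rate': 1e-3},
    },
})
factory = optimization.OptimizerFactory(opt_config)
optimizer = factory.build_optimizer(factory.build_learning_rate())
# -> an instance of tf.keras.optimizers.experimental.Adam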
@@ -115,7 +115,8 @@ class MaskedLM(tf.keras.layers.Layer):
     flat_offsets = tf.reshape(
         tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
-    flat_positions = tf.reshape(positions + flat_offsets, [-1])
+    flat_positions = tf.reshape(
+        positions + tf.cast(flat_offsets, positions.dtype), [-1])
     flat_sequence_tensor = tf.reshape(sequence_tensor,
                                       [batch_size * seq_length, width])
     output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
...
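The cast guards against a dtype mismatch: `flat_offsets` is always built as int32, but `positions` can arrive as int64 (for example from a tf.data pipeline), and adding mismatched integer dtypes raises an error. A small self-contained repro of the fixed gather (shapes are illustrative):

import tensorflow as tf

batch_size, seq_length, width = 2, 4, 3
sequence_tensor = tf.reshape(
    tf.range(batch_size * seq_length * width, dtype=tf.float32),
    [batch_size, seq_length, width])
positions = tf.constant([[0, 2], [1, 3]], dtype=tf.int64)  # int64 on purpose.

flat_offsets = tf.reshape(
    tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
# Without the cast, int64 + int32 fails; with it, offsets follow `positions`.
flat_positions = tf.reshape(
    positions + tf.cast(flat_offsets, positions.dtype), [-1])
flat_sequence_tensor = tf.reshape(
    sequence_tensor, [batch_size * seq_length, width])
output_tensor = tf.gather(flat_sequence_tensor, flat_positions)  # [4, 3]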
@@ -20,8 +20,10 @@ from absl import logging
 import tensorflow as tf

 try:
+  # pytype: disable=import-error
   import tensorflow_text as text
   from tensorflow_text.python.ops import bert_tokenizer
+  # pytype: enable=import-error
 except ImportError:
   text = None
   bert_tokenizer = None
...
@@ -226,6 +226,7 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
       funnel encoder relies on.
     share_rezero: bool. Whether to share ReZero alpha between the attention
       layer and the ffn layer. This option is specific to ReZero.
+    with_dense_inputs: Whether to accept dense embeddings as the input.
   """

   def __init__(
@@ -402,12 +403,22 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
             _transformer_cls2str.get(transformer_cls, str(transformer_cls))
     }

+    self.inputs = dict(
+        input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
+        input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
+        input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32))
+
   def call(self, inputs):
     # inputs are [word_ids, mask, type_ids]
     if isinstance(inputs, (list, tuple)):
       logging.warning('List inputs to %s are discouraged.', self.__class__)
       if len(inputs) == 3:
         word_ids, mask, type_ids = inputs
+        dense_inputs = None
+        dense_mask = None
+        dense_type_ids = None
+      elif len(inputs) == 6:
+        word_ids, mask, type_ids, dense_inputs, dense_mask, dense_type_ids = inputs
       else:
         raise ValueError('Unexpected inputs to %s with length at %d.' %
                          (self.__class__, len(inputs)))
@@ -415,10 +426,21 @@ class FunnelTransformerEncoder(tf.keras.layers.Layer):
       word_ids = inputs.get('input_word_ids')
       mask = inputs.get('input_mask')
       type_ids = inputs.get('input_type_ids')
+      dense_inputs = inputs.get('dense_inputs', None)
+      dense_mask = inputs.get('dense_mask', None)
+      dense_type_ids = inputs.get('dense_type_ids', None)
     else:
       raise ValueError('Unexpected inputs type to %s.' % self.__class__)

     word_embeddings = self._embedding_layer(word_ids)
+
+    if dense_inputs is not None:
+      # Concat the dense embeddings at the start of the sequence so that
+      # unpool_length can keep them from being pooled.
+      word_embeddings = tf.concat([dense_inputs, word_embeddings], axis=1)
+      type_ids = tf.concat([dense_type_ids, type_ids], axis=1)
+      mask = tf.concat([dense_mask, mask], axis=1)
+
     # absolute position embeddings
     position_embeddings = self._position_embedding_layer(word_embeddings)
     type_embeddings = self._type_embedding_layer(type_ids)
...
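A hedged usage sketch for the new dense path via the dict calling convention (the encoder construction is elided and shapes are illustrative). Because the dense embeddings are prepended, setting `unpool_length` to their length would exempt them from pooling:

import tensorflow as tf

# encoder = FunnelTransformerEncoder(..., with_dense_inputs=True)  # assumed
inputs = {
    "input_word_ids": tf.ones([2, 21], tf.int32),
    "input_mask": tf.ones([2, 21], tf.int32),
    "input_type_ids": tf.zeros([2, 21], tf.int32),
    "dense_inputs": tf.zeros([2, 3, 32], tf.float32),  # [batch, len, hidden]
    "dense_mask": tf.ones([2, 3], tf.int32),
    "dense_type_ids": tf.zeros([2, 3], tf.int32),
}
# outputs = encoder(inputs)
# outputs["sequence_output"], outputs["pooled_output"]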
@@ -101,6 +101,55 @@ class FunnelTransformerEncoderTest(parameterized.TestCase, tf.test.TestCase):
     self.assertAllEqual(tf.float32, data.dtype)
     self.assertAllEqual(pooled_dtype, pooled.dtype)

+  def test_network_creation_dense(self):
+    tf.keras.mixed_precision.set_global_policy("mixed_float16")
+    pool_type = "avg"
+    hidden_size = 32
+    sequence_length = 21
+    dense_sequence_length = 3
+    pool_stride = 2
+    num_layers = 3
+    # Create a small FunnelTransformerEncoder for testing.
+    test_network = funnel_transformer.FunnelTransformerEncoder(
+        vocab_size=100,
+        hidden_size=hidden_size,
+        num_attention_heads=2,
+        num_layers=num_layers,
+        pool_stride=pool_stride,
+        pool_type=pool_type,
+        max_sequence_length=sequence_length + dense_sequence_length,
+        unpool_length=0,
+        transformer_cls="TransformerEncoderBlock")
+    # Create the inputs (note that the first dimension is implicit).
+    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
+    dense_inputs = tf.keras.Input(
+        shape=(dense_sequence_length, hidden_size), dtype=tf.float32)
+    dense_mask = tf.keras.Input(shape=(dense_sequence_length,), dtype=tf.int32)
+    dense_type_ids = tf.keras.Input(
+        shape=(dense_sequence_length,), dtype=tf.int32)
+    dict_outputs = test_network(
+        [word_ids, mask, type_ids, dense_inputs, dense_mask, dense_type_ids])
+    data = dict_outputs["sequence_output"]
+    pooled = dict_outputs["pooled_output"]
+
+    self.assertIsInstance(test_network.transformer_layers, list)
+    self.assertLen(test_network.transformer_layers, num_layers)
+    self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)
+
+    # Stride=2 halves the sequence length at each layer. For pool_type = max
+    # or avg, this configuration gives per-layer sequence lengths of
+    # 24 -> 12 -> 6 -> 3.
+    expected_data_shape = [None, 3, hidden_size]
+    expected_pooled_shape = [None, hidden_size]
+    self.assertAllEqual(expected_data_shape, data.shape.as_list())
+    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())
+
   def test_invalid_stride_and_num_layers(self):
     hidden_size = 32
     num_layers = 3
...
@@ -417,6 +417,8 @@ class Translation(export_base.ExportModule):
   @dataclasses.dataclass
   class Params(base_config.Config):
     sentencepiece_model_path: str = ""
+    # Needs to be specified if padded_decode is True (as on TPUs).
+    batch_size: Optional[int] = None

   def __init__(self, params, model: tf.keras.Model, inference_step=None):
     super().__init__(params, model, inference_step)
@@ -431,6 +433,7 @@ class Translation(export_base.ExportModule):
           "Please make sure the tokenizer generates a single token for an "
           "empty string.")
     self._eos_id = empty_str_tokenized.item()
+    self._batch_size = params.batch_size

   @tf.function
   def serve(self, inputs) -> Dict[str, tf.Tensor]:
@@ -452,5 +455,6 @@ class Translation(export_base.ExportModule):
                          (self.__class__, func_key, valid_keys))
       if func_key == "serve_text":
         signatures[signature_key] = self.serve_text.get_concrete_function(
-            tf.TensorSpec(shape=[None], dtype=tf.string, name="text"))
+            tf.TensorSpec(shape=[self._batch_size],
+                          dtype=tf.string, name="text"))
     return signatures
...
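With `padded_decode=True` (required for TPU serving), the decode buffers are statically shaped, so the exported signature must also fix its batch dimension; the new `Params.batch_size` feeds that into the `TensorSpec`. A hedged sketch of the intended use (the model path is a placeholder and `model` is assumed to be a built translation model):

params = serving_modules.Translation.Params(
    sentencepiece_model_path="/path/to/sp.model",  # placeholder path
    batch_size=8)  # -> tf.TensorSpec(shape=[8], dtype=tf.string, name="text")
export_module = serving_modules.Translation(params=params, model=model)
signatures = export_module.get_inference_signatures(
    {"serve_text": "serving_default"})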
@@ -20,6 +20,7 @@ from absl.testing import parameterized
 import tensorflow as tf
 from sentencepiece import SentencePieceTrainer
+from official.core import export_base
 from official.nlp.configs import bert
 from official.nlp.configs import encoders
 from official.nlp.serving import serving_modules
@@ -343,7 +344,10 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
     with self.assertRaises(ValueError):
       _ = export_module.get_inference_signatures({"foo": None})

-  def test_translation(self):
+  @parameterized.parameters(
+      (False, None),
+      (True, 2))
+  def test_translation(self, padded_decode, batch_size):
     sp_path = _make_sentencepeice(self.get_temp_dir())
     encdecoder = translation.EncDecoder(
         num_attention_heads=4, intermediate_size=256)
@@ -352,7 +356,7 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
             encoder=encdecoder,
             decoder=encdecoder,
             embedding_width=256,
-            padded_decode=False,
+            padded_decode=padded_decode,
             decode_max_length=100),
         sentencepiece_model_path=sp_path,
     )
@@ -360,7 +364,7 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
     model = task.build_model()
     params = serving_modules.Translation.Params(
-        sentencepiece_model_path=sp_path)
+        sentencepiece_model_path=sp_path, batch_size=batch_size)
     export_module = serving_modules.Translation(params=params, model=model)
     functions = export_module.get_inference_signatures({
         "serve_text": "serving_default"
@@ -369,5 +373,19 @@ class ServingModulesTest(tf.test.TestCase, parameterized.TestCase):
     self.assertEqual(outputs.shape, (2,))
     self.assertEqual(outputs.dtype, tf.string)

+    tmp_dir = self.get_temp_dir()
+    tmp_dir = os.path.join(tmp_dir, "padded_decode", str(padded_decode))
+    export_base_dir = os.path.join(tmp_dir, "export")
+    ckpt_dir = os.path.join(tmp_dir, "ckpt")
+    ckpt_path = tf.train.Checkpoint(model=model).save(ckpt_dir)
+    export_dir = export_base.export(export_module,
+                                    {"serve_text": "serving_default"},
+                                    export_base_dir, ckpt_path)
+    loaded = tf.saved_model.load(export_dir)
+    infer = loaded.signatures["serving_default"]
+    out = infer(text=tf.constant(["abcd", "ef gh"]))
+    self.assertLen(out["output_0"], 2)
+

 if __name__ == "__main__":
   tf.test.main()
...
@@ -84,13 +84,13 @@ def _create_model(
   """Creates the model to export and the model to restore the checkpoint.

   Args:
-    bert_config: A legacy `BertConfig` to create a `BertEncoder` object.
-      Exactly one of encoder_config and bert_config must be set.
+    bert_config: A legacy `BertConfig` to create a `BertEncoder` object. Exactly
+      one of encoder_config and bert_config must be set.
     encoder_config: An `EncoderConfig` to create an encoder of the configured
       type (`BertEncoder` or other).
-    with_mlm: A bool to control the second component of the result.
-      If True, will create a `BertPretrainerV2` object; otherwise, will
-      create a `BertEncoder` object.
+    with_mlm: A bool to control the second component of the result. If True,
+      will create a `BertPretrainerV2` object; otherwise, will create a
+      `BertEncoder` object.

   Returns:
     A Tuple of (1) a Keras model that will be exported, (2) a `BertPretrainerV2`
@@ -110,7 +110,11 @@ def _create_model(
   # Convert from list of named inputs to dict of inputs keyed by name.
   # Only the latter accepts a dict of inputs after restoring from SavedModel.
-  encoder_inputs_dict = {x.name: x for x in encoder.inputs}
+  if isinstance(encoder.inputs, (list, tuple)):
+    encoder_inputs_dict = {x.name: x for x in encoder.inputs}
+  else:
+    # encoder.inputs is a dict by default for BertEncoderV2.
+    encoder_inputs_dict = encoder.inputs
   encoder_output_dict = encoder(encoder_inputs_dict)
   # For interchangeability with other text representations,
   # add "default" as an alias for BERT's whole-input representations.
@@ -206,26 +210,28 @@ def export_model(export_path: Text,
     encoder_config: An optional `encoders.EncoderConfig` object.
     model_checkpoint_path: The path to the checkpoint.
     with_mlm: Whether to export the additional mlm sub-object.
-    copy_pooler_dense_to_encoder: Whether to copy the pooler's dense layer
-      used in the next sentence prediction task to the encoder.
+    copy_pooler_dense_to_encoder: Whether to copy the pooler's dense layer used
+      in the next sentence prediction task to the encoder.
     vocab_file: The path to the wordpiece vocab file, or None.
-    sp_model_file: The path to the sentencepiece model file, or None.
-      Exactly one of vocab_file and sp_model_file must be set.
+    sp_model_file: The path to the sentencepiece model file, or None. Exactly
+      one of vocab_file and sp_model_file must be set.
     do_lower_case: Whether to lower-case text before tokenization.
   """
   if with_mlm:
-    core_model, pretrainer = _create_model(bert_config=bert_config,
-                                           encoder_config=encoder_config,
-                                           with_mlm=with_mlm)
+    core_model, pretrainer = _create_model(
+        bert_config=bert_config,
+        encoder_config=encoder_config,
+        with_mlm=with_mlm)
     encoder = pretrainer.encoder_network
     # It supports both the new pretrainer checkpoint produced by TF-NLP and
     # the checkpoint converted from TF1 (original BERT, SmallBERTs).
     checkpoint_items = pretrainer.checkpoint_items
     checkpoint = tf.train.Checkpoint(**checkpoint_items)
   else:
-    core_model, encoder = _create_model(bert_config=bert_config,
-                                        encoder_config=encoder_config,
-                                        with_mlm=with_mlm)
+    core_model, encoder = _create_model(
+        bert_config=bert_config,
+        encoder_config=encoder_config,
+        with_mlm=with_mlm)
     checkpoint = tf.train.Checkpoint(
         model=encoder,  # Legacy checkpoints.
         encoder=encoder)
@@ -279,21 +285,26 @@ class BertPackInputsSavedModelWrapper(tf.train.Checkpoint):
     # overridable. Having this dynamically determined default argument
     # requires self.__call__ to be defined in this indirect way.
     default_seq_length = bert_pack_inputs.seq_length

     @tf.function(autograph=False)
     def call(inputs, seq_length=default_seq_length):
       return layers.BertPackInputs.bert_pack_inputs(
-          inputs, seq_length=seq_length,
+          inputs,
+          seq_length=seq_length,
           start_of_sequence_id=bert_pack_inputs.start_of_sequence_id,
           end_of_segment_id=bert_pack_inputs.end_of_segment_id,
           padding_id=bert_pack_inputs.padding_id)

     self.__call__ = call

     for ragged_rank in range(1, 3):
       for num_segments in range(1, 3):
-        _ = self.__call__.get_concrete_function(
-            [tf.RaggedTensorSpec([None] * (ragged_rank + 1), dtype=tf.int32)
-             for _ in range(num_segments)],
-            seq_length=tf.TensorSpec([], tf.int32))
+        _ = self.__call__.get_concrete_function([
+            tf.RaggedTensorSpec([None] * (ragged_rank + 1), dtype=tf.int32)
+            for _ in range(num_segments)
+        ],
+                                                seq_length=tf.TensorSpec(
+                                                    [], tf.int32))


 def create_preprocessing(*,
@@ -311,14 +322,14 @@ def create_preprocessing(*,
   Args:
     vocab_file: The path to the wordpiece vocab file, or None.
-    sp_model_file: The path to the sentencepiece model file, or None.
-      Exactly one of vocab_file and sp_model_file must be set.
-      This determines the type of tokenizer that is used.
+    sp_model_file: The path to the sentencepiece model file, or None. Exactly
+      one of vocab_file and sp_model_file must be set. This determines the type
+      of tokenizer that is used.
     do_lower_case: Whether to do lower case.
     tokenize_with_offsets: Whether to include the .tokenize_with_offsets
       subobject.
-    default_seq_length: The sequence length of preprocessing results from
-      root callable. This is also the default sequence length for the
+    default_seq_length: The sequence length of preprocessing results from root
+      callable. This is also the default sequence length for the
       bert_pack_inputs subobject.

   Returns:
@@ -378,7 +389,8 @@ def create_preprocessing(*,
 def _move_to_tmpdir(file_path: Optional[Text], tmpdir: Text) -> Optional[Text]:
   """Returns new path with same basename and hash of original path."""
-  if file_path is None: return None
+  if file_path is None:
+    return None
   olddir, filename = os.path.split(file_path)
   hasher = hashlib.sha1()
   hasher.update(olddir.encode("utf-8"))
@@ -460,12 +472,17 @@ def _check_no_assert(saved_model_path):
   assert_nodes = []
   graph_def = saved_model.meta_graphs[0].graph_def
-  assert_nodes += ["node '{}' in global graph".format(n.name)
-                   for n in graph_def.node if n.op == "Assert"]
+  assert_nodes += [
+      "node '{}' in global graph".format(n.name)
+      for n in graph_def.node
+      if n.op == "Assert"
+  ]
   for fdef in graph_def.library.function:
     assert_nodes += [
         "node '{}' in function '{}'".format(n.name, fdef.signature.name)
-        for n in fdef.node_def if n.op == "Assert"]
+        for n in fdef.node_def
+        if n.op == "Assert"
+    ]
   if assert_nodes:
     raise AssertionError(
         "Internal tool error: "
...
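The `encoder.inputs` branch above exists because `BertEncoderV2` exposes its Keras inputs as a dict, while older encoders expose a list of named input tensors. A distilled sketch of the normalization (the helper name is hypothetical):

def to_inputs_dict(encoder_inputs):
  """Normalizes encoder inputs to a dict keyed by tensor name."""
  if isinstance(encoder_inputs, (list, tuple)):
    # Legacy encoders: a list of named Keras Inputs.
    return {x.name: x for x in encoder_inputs}
  # BertEncoderV2 already provides a dict of Keras Inputs.
  return encoder_inputs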
...@@ -32,9 +32,26 @@ from official.nlp.modeling import models ...@@ -32,9 +32,26 @@ from official.nlp.modeling import models
from official.nlp.tools import export_tfhub_lib from official.nlp.tools import export_tfhub_lib
def _get_bert_config_or_encoder_config(use_bert_config, hidden_size, def _get_bert_config_or_encoder_config(use_bert_config,
num_hidden_layers, vocab_size=100): hidden_size,
"""Returns config args for export_tfhub_lib._create_model().""" num_hidden_layers,
encoder_type="albert",
vocab_size=100):
"""Generates config args for export_tfhub_lib._create_model().
Args:
use_bert_config: bool. If True, returns legacy BertConfig.
hidden_size: int.
num_hidden_layers: int.
encoder_type: str. Can be ['albert', 'bert', 'bert_v2']. If use_bert_config
== True, then model_type is not used.
vocab_size: int.
Returns:
bert_config, encoder_config. Only one is not None. If
`use_bert_config` == True, the first config is valid. Otherwise
`bert_config` == None.
"""
if use_bert_config: if use_bert_config:
bert_config = configs.BertConfig( bert_config = configs.BertConfig(
vocab_size=vocab_size, vocab_size=vocab_size,
...@@ -46,17 +63,31 @@ def _get_bert_config_or_encoder_config(use_bert_config, hidden_size, ...@@ -46,17 +63,31 @@ def _get_bert_config_or_encoder_config(use_bert_config, hidden_size,
encoder_config = None encoder_config = None
else: else:
bert_config = None bert_config = None
encoder_config = encoders.EncoderConfig( if encoder_type == "albert":
type="albert", encoder_config = encoders.EncoderConfig(
albert=encoders.AlbertEncoderConfig( type="albert",
vocab_size=vocab_size, albert=encoders.AlbertEncoderConfig(
embedding_width=16, vocab_size=vocab_size,
hidden_size=hidden_size, embedding_width=16,
intermediate_size=32, hidden_size=hidden_size,
max_position_embeddings=128, intermediate_size=32,
num_attention_heads=2, max_position_embeddings=128,
num_layers=num_hidden_layers, num_attention_heads=2,
dropout_rate=0.1)) num_layers=num_hidden_layers,
dropout_rate=0.1))
else:
# encoder_type can be 'bert' or 'bert_v2'.
model_config = encoders.BertEncoderConfig(
vocab_size=vocab_size,
embedding_size=16,
hidden_size=hidden_size,
intermediate_size=32,
max_position_embeddings=128,
num_attention_heads=2,
num_layers=num_hidden_layers,
dropout_rate=0.1)
kwargs = {"type": encoder_type, encoder_type: model_config}
encoder_config = encoders.EncoderConfig(**kwargs)
return bert_config, encoder_config return bert_config, encoder_config
...@@ -105,13 +136,18 @@ class ExportModelTest(tf.test.TestCase, parameterized.TestCase): ...@@ -105,13 +136,18 @@ class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
alternative to BertTokenizer). alternative to BertTokenizer).
""" """
@parameterized.named_parameters(("Bert", True), ("Albert", False)) @parameterized.named_parameters(
def test_export_model(self, use_bert): ("Bert_Legacy", True, None), ("Albert", False, "albert"),
("BertEncoder", False, "bert"), ("BertEncoderV2", False, "bert_v2"))
def test_export_model(self, use_bert, encoder_type):
# Create the encoder and export it. # Create the encoder and export it.
hidden_size = 16 hidden_size = 16
num_hidden_layers = 1 num_hidden_layers = 1
bert_config, encoder_config = _get_bert_config_or_encoder_config( bert_config, encoder_config = _get_bert_config_or_encoder_config(
use_bert, hidden_size, num_hidden_layers) use_bert,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
encoder_type=encoder_type)
bert_model, encoder = export_tfhub_lib._create_model( bert_model, encoder = export_tfhub_lib._create_model(
bert_config=bert_config, encoder_config=encoder_config, with_mlm=False) bert_config=bert_config, encoder_config=encoder_config, with_mlm=False)
self.assertEmpty( self.assertEmpty(
...@@ -151,8 +187,8 @@ class ExportModelTest(tf.test.TestCase, parameterized.TestCase): ...@@ -151,8 +187,8 @@ class ExportModelTest(tf.test.TestCase, parameterized.TestCase):
_read_asset(hub_layer.resolved_object.sp_model_file)) _read_asset(hub_layer.resolved_object.sp_model_file))
# Check restored weights. # Check restored weights.
self.assertEqual(len(bert_model.trainable_weights), self.assertEqual(
len(hub_layer.trainable_weights)) len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
for source_weight, hub_weight in zip(bert_model.trainable_weights, for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights): hub_layer.trainable_weights):
self.assertAllClose(source_weight.numpy(), hub_weight.numpy()) self.assertAllClose(source_weight.numpy(), hub_weight.numpy())
...@@ -334,8 +370,8 @@ class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase): ...@@ -334,8 +370,8 @@ class ExportModelWithMLMTest(tf.test.TestCase, parameterized.TestCase):
# Note that we set `_auto_track_sub_layers` to False when exporting the # Note that we set `_auto_track_sub_layers` to False when exporting the
# SavedModel, so hub_layer has the same number of weights as bert_model; # SavedModel, so hub_layer has the same number of weights as bert_model;
# otherwise, hub_layer will have extra weights from its `mlm` subobject. # otherwise, hub_layer will have extra weights from its `mlm` subobject.
self.assertEqual(len(bert_model.trainable_weights), self.assertEqual(
len(hub_layer.trainable_weights)) len(bert_model.trainable_weights), len(hub_layer.trainable_weights))
for source_weight, hub_weight in zip(bert_model.trainable_weights, for source_weight, hub_weight in zip(bert_model.trainable_weights,
hub_layer.trainable_weights): hub_layer.trainable_weights):
self.assertAllClose(source_weight, hub_weight) self.assertAllClose(source_weight, hub_weight)
...@@ -473,10 +509,11 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -473,10 +509,11 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
The absolute filename of the created vocab file. The absolute filename of the created vocab file.
""" """
full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]" full_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]"
] + ["[MASK]"]*add_mask_token + vocab ] + ["[MASK]"] * add_mask_token + vocab
path = os.path.join( path = os.path.join(
tempfile.mkdtemp(dir=self.get_temp_dir(), # New subdir each time. tempfile.mkdtemp(
prefix=_STRING_NOT_TO_LEAK), dir=self.get_temp_dir(), # New subdir each time.
prefix=_STRING_NOT_TO_LEAK),
filename) filename)
with tf.io.gfile.GFile(path, "w") as f: with tf.io.gfile.GFile(path, "w") as f:
f.write("\n".join(full_vocab + [""])) f.write("\n".join(full_vocab + [""]))
...@@ -522,22 +559,30 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -522,22 +559,30 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
model_prefix=model_prefix, model_prefix=model_prefix,
model_type="word", model_type="word",
input=input_file, input=input_file,
pad_id=0, unk_id=1, control_symbols=control_symbols, pad_id=0,
unk_id=1,
control_symbols=control_symbols,
vocab_size=full_vocab_size, vocab_size=full_vocab_size,
bos_id=full_vocab_size-2, eos_id=full_vocab_size-1) bos_id=full_vocab_size - 2,
SentencePieceTrainer.Train( eos_id=full_vocab_size - 1)
" ".join(["--{}={}".format(k, v) for k, v in flags.items()])) SentencePieceTrainer.Train(" ".join(
["--{}={}".format(k, v) for k, v in flags.items()]))
return model_prefix + ".model" return model_prefix + ".model"
def _do_export(self, vocab, do_lower_case, default_seq_length=128, def _do_export(self,
tokenize_with_offsets=True, use_sp_model=False, vocab,
experimental_disable_assert=False, add_mask_token=False): do_lower_case,
default_seq_length=128,
tokenize_with_offsets=True,
use_sp_model=False,
experimental_disable_assert=False,
add_mask_token=False):
"""Runs SavedModel export and returns the export_path.""" """Runs SavedModel export and returns the export_path."""
export_path = tempfile.mkdtemp(dir=self.get_temp_dir()) export_path = tempfile.mkdtemp(dir=self.get_temp_dir())
vocab_file = sp_model_file = None vocab_file = sp_model_file = None
if use_sp_model: if use_sp_model:
sp_model_file = self._make_sp_model_file(vocab, sp_model_file = self._make_sp_model_file(
add_mask_token=add_mask_token) vocab, add_mask_token=add_mask_token)
else: else:
vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token) vocab_file = self._make_vocab_file(vocab, add_mask_token=add_mask_token)
export_tfhub_lib.export_preprocessing( export_tfhub_lib.export_preprocessing(
...@@ -554,19 +599,24 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -554,19 +599,24 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def test_no_leaks(self): def test_no_leaks(self):
"""Tests not leaking the path to the original vocab file.""" """Tests not leaking the path to the original vocab file."""
path = self._do_export( path = self._do_export(["d", "ef", "abc", "xy"],
["d", "ef", "abc", "xy"], do_lower_case=True, use_sp_model=False) do_lower_case=True,
use_sp_model=False)
with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f: with tf.io.gfile.GFile(os.path.join(path, "saved_model.pb"), "rb") as f:
self.assertFalse( # pylint: disable=g-generic-assert self.assertFalse( # pylint: disable=g-generic-assert
_STRING_NOT_TO_LEAK.encode("ascii") in f.read()) _STRING_NOT_TO_LEAK.encode("ascii") in f.read())
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True)) @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_exported_callables(self, use_sp_model): def test_exported_callables(self, use_sp_model):
preprocess = tf.saved_model.load(self._do_export( preprocess = tf.saved_model.load(
["d", "ef", "abc", "xy"], do_lower_case=True, self._do_export(
tokenize_with_offsets=not use_sp_model, # TODO(b/181866850): drop this. ["d", "ef", "abc", "xy"],
experimental_disable_assert=True, # TODO(b/175369555): drop this. do_lower_case=True,
use_sp_model=use_sp_model)) # TODO(b/181866850): drop this.
tokenize_with_offsets=not use_sp_model,
# TODO(b/175369555): drop this.
experimental_disable_assert=True,
use_sp_model=use_sp_model))
def fold_dim(rt): def fold_dim(rt):
"""Removes the word/subword distinction of BertTokenizer.""" """Removes the word/subword distinction of BertTokenizer."""
...@@ -575,18 +625,20 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -575,18 +625,20 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
# .tokenize() # .tokenize()
inputs = tf.constant(["abc d ef", "ABC D EF d"]) inputs = tf.constant(["abc d ef", "ABC D EF d"])
token_ids = preprocess.tokenize(inputs) token_ids = preprocess.tokenize(inputs)
self.assertAllEqual(fold_dim(token_ids), self.assertAllEqual(
tf.ragged.constant([[6, 4, 5], fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))
[6, 4, 5, 4]]))
special_tokens_dict = { special_tokens_dict = {
k: v.numpy().item() # Expecting eager Tensor, converting to Python. k: v.numpy().item() # Expecting eager Tensor, converting to Python.
for k, v in preprocess.tokenize.get_special_tokens_dict().items()} for k, v in preprocess.tokenize.get_special_tokens_dict().items()
self.assertDictEqual(special_tokens_dict, }
dict(padding_id=0, self.assertDictEqual(
start_of_sequence_id=2, special_tokens_dict,
end_of_segment_id=3, dict(
vocab_size=4+6 if use_sp_model else 4+4)) padding_id=0,
start_of_sequence_id=2,
end_of_segment_id=3,
vocab_size=4 + 6 if use_sp_model else 4 + 4))
# .tokenize_with_offsets() # .tokenize_with_offsets()
if use_sp_model: if use_sp_model:
...@@ -595,92 +647,104 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -595,92 +647,104 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
else: else:
token_ids, start_offsets, limit_offsets = ( token_ids, start_offsets, limit_offsets = (
preprocess.tokenize_with_offsets(inputs)) preprocess.tokenize_with_offsets(inputs))
self.assertAllEqual(fold_dim(token_ids), self.assertAllEqual(
tf.ragged.constant([[6, 4, 5], fold_dim(token_ids), tf.ragged.constant([[6, 4, 5], [6, 4, 5, 4]]))
[6, 4, 5, 4]])) self.assertAllEqual(
self.assertAllEqual(fold_dim(start_offsets), fold_dim(start_offsets), tf.ragged.constant([[0, 4, 6], [0, 4, 6,
tf.ragged.constant([[0, 4, 6], 9]]))
[0, 4, 6, 9]])) self.assertAllEqual(
self.assertAllEqual(fold_dim(limit_offsets), fold_dim(limit_offsets), tf.ragged.constant([[3, 5, 8], [3, 5, 8,
tf.ragged.constant([[3, 5, 8], 10]]))
[3, 5, 8, 10]]))
self.assertIs(preprocess.tokenize.get_special_tokens_dict, self.assertIs(preprocess.tokenize.get_special_tokens_dict,
preprocess.tokenize_with_offsets.get_special_tokens_dict) preprocess.tokenize_with_offsets.get_special_tokens_dict)
# Root callable. # Root callable.
bert_inputs = preprocess(inputs) bert_inputs = preprocess(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128]) self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_word_ids"][:, :10], self.assertAllEqual(
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0], bert_inputs["input_word_ids"][:, :10],
[2, 6, 4, 5, 4, 3, 0, 0, 0, 0]])) tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
[2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128]) self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_mask"][:, :10], self.assertAllEqual(
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], bert_inputs["input_mask"][:, :10],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])) tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128]) self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 128])
self.assertAllEqual(bert_inputs["input_type_ids"][:, :10], self.assertAllEqual(
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], bert_inputs["input_type_ids"][:, :10],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
# .bert_pack_inputs() # .bert_pack_inputs()
inputs_2 = tf.constant(["d xy", "xy abc"]) inputs_2 = tf.constant(["d xy", "xy abc"])
token_ids_2 = preprocess.tokenize(inputs_2) token_ids_2 = preprocess.tokenize(inputs_2)
bert_inputs = preprocess.bert_pack_inputs( bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2],
[token_ids, token_ids_2], seq_length=256) seq_length=256)
self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256]) self.assertAllEqual(bert_inputs["input_word_ids"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_word_ids"][:, :10], self.assertAllEqual(
tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0], bert_inputs["input_word_ids"][:, :10],
[2, 6, 4, 5, 4, 3, 7, 6, 3, 0]])) tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
[2, 6, 4, 5, 4, 3, 7, 6, 3, 0]]))
self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256]) self.assertAllEqual(bert_inputs["input_mask"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_mask"][:, :10], self.assertAllEqual(
tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], bert_inputs["input_mask"][:, :10],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])) tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]))
self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256]) self.assertAllEqual(bert_inputs["input_type_ids"].shape.as_list(), [2, 256])
self.assertAllEqual(bert_inputs["input_type_ids"][:, :10], self.assertAllEqual(
tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0], bert_inputs["input_type_ids"][:, :10],
[0, 0, 0, 0, 0, 0, 1, 1, 1, 0]])) tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 1, 1, 0]]))
# For BertTokenizer only: repeat relevant parts for do_lower_case=False, # For BertTokenizer only: repeat relevant parts for do_lower_case=False,
# default_seq_length=10, experimental_disable_assert=False, # default_seq_length=10, experimental_disable_assert=False,
# tokenize_with_offsets=False, and without folding the word/subword dimension. # tokenize_with_offsets=False, and without folding the word/subword dimension.
def test_cased_length10(self): def test_cased_length10(self):
preprocess = tf.saved_model.load(self._do_export( preprocess = tf.saved_model.load(
["d", "##ef", "abc", "ABC"], self._do_export(["d", "##ef", "abc", "ABC"],
do_lower_case=False, default_seq_length=10, do_lower_case=False,
tokenize_with_offsets=False, default_seq_length=10,
use_sp_model=False, tokenize_with_offsets=False,
experimental_disable_assert=False)) use_sp_model=False,
experimental_disable_assert=False))
inputs = tf.constant(["abc def", "ABC DEF"]) inputs = tf.constant(["abc def", "ABC DEF"])
token_ids = preprocess.tokenize(inputs) token_ids = preprocess.tokenize(inputs)
self.assertAllEqual(token_ids, tf.ragged.constant([[[6], [4, 5]], self.assertAllEqual(token_ids,
[[7], [1]]])) tf.ragged.constant([[[6], [4, 5]], [[7], [1]]]))
self.assertFalse(hasattr(preprocess, "tokenize_with_offsets")) self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
bert_inputs = preprocess(inputs) bert_inputs = preprocess(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"], self.assertAllEqual(
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0], bert_inputs["input_word_ids"],
[2, 7, 1, 3, 0, 0, 0, 0, 0, 0]])) tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
self.assertAllEqual(bert_inputs["input_mask"], [2, 7, 1, 3, 0, 0, 0, 0, 0, 0]]))
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], self.assertAllEqual(
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])) bert_inputs["input_mask"],
self.assertAllEqual(bert_inputs["input_type_ids"], tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]))
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) self.assertAllEqual(
bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
inputs_2 = tf.constant(["d ABC", "ABC abc"]) inputs_2 = tf.constant(["d ABC", "ABC abc"])
token_ids_2 = preprocess.tokenize(inputs_2) token_ids_2 = preprocess.tokenize(inputs_2)
bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2]) bert_inputs = preprocess.bert_pack_inputs([token_ids, token_ids_2])
# Test default seq_length=10. # Test default seq_length=10.
self.assertAllEqual(bert_inputs["input_word_ids"], self.assertAllEqual(
tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0], bert_inputs["input_word_ids"],
[2, 7, 1, 3, 7, 6, 3, 0, 0, 0]])) tf.constant([[2, 6, 4, 5, 3, 4, 7, 3, 0, 0],
self.assertAllEqual(bert_inputs["input_mask"], [2, 7, 1, 3, 7, 6, 3, 0, 0, 0]]))
tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], self.assertAllEqual(
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])) bert_inputs["input_mask"],
self.assertAllEqual(bert_inputs["input_type_ids"], tf.constant([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]))
[0, 0, 0, 0, 1, 1, 1, 0, 0, 0]])) self.assertAllEqual(
bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
[0, 0, 0, 0, 1, 1, 1, 0, 0, 0]]))
# XLA requires fixed shapes for tensors found in graph mode. # XLA requires fixed shapes for tensors found in graph mode.
# Statically known shapes in Python are a particularly firm way to # Statically known shapes in Python are a particularly firm way to
...@@ -689,16 +753,21 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -689,16 +753,21 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
# inference when applied to fully or partially known input shapes. # inference when applied to fully or partially known input shapes.
@parameterized.named_parameters(("Bert", False), ("Sentencepiece", True)) @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
def test_shapes(self, use_sp_model): def test_shapes(self, use_sp_model):
preprocess = tf.saved_model.load(self._do_export( preprocess = tf.saved_model.load(
["abc", "def"], do_lower_case=True, self._do_export(
tokenize_with_offsets=not use_sp_model, # TODO(b/181866850): drop this. ["abc", "def"],
experimental_disable_assert=True, # TODO(b/175369555): drop this. do_lower_case=True,
use_sp_model=use_sp_model)) # TODO(b/181866850): drop this.
tokenize_with_offsets=not use_sp_model,
# TODO(b/175369555): drop this.
experimental_disable_assert=True,
use_sp_model=use_sp_model))
def expected_bert_input_shapes(batch_size, seq_length): def expected_bert_input_shapes(batch_size, seq_length):
return dict(input_word_ids=[batch_size, seq_length], return dict(
input_mask=[batch_size, seq_length], input_word_ids=[batch_size, seq_length],
input_type_ids=[batch_size, seq_length]) input_mask=[batch_size, seq_length],
input_type_ids=[batch_size, seq_length])
for batch_size in [7, None]: for batch_size in [7, None]:
if use_sp_model: if use_sp_model:
...@@ -706,11 +775,9 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -706,11 +775,9 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
else: else:
token_out_shape = [batch_size, None, None] token_out_shape = [batch_size, None, None]
self.assertEqual( self.assertEqual(
_result_shapes_in_tf_function( _result_shapes_in_tf_function(preprocess.tokenize,
preprocess.tokenize, tf.TensorSpec([batch_size], tf.string)),
tf.TensorSpec([batch_size], tf.string)), token_out_shape, "with batch_size=%s" % batch_size)
token_out_shape,
"with batch_size=%s" % batch_size)
# TODO(b/181866850): Enable tokenize_with_offsets when it works and test. # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
if use_sp_model: if use_sp_model:
self.assertFalse(hasattr(preprocess, "tokenize_with_offsets")) self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
...@@ -718,8 +785,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -718,8 +785,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
self.assertEqual( self.assertEqual(
_result_shapes_in_tf_function( _result_shapes_in_tf_function(
preprocess.tokenize_with_offsets, preprocess.tokenize_with_offsets,
tf.TensorSpec([batch_size], tf.string)), tf.TensorSpec([batch_size], tf.string)), [token_out_shape] * 3,
[token_out_shape] * 3,
"with batch_size=%s" % batch_size) "with batch_size=%s" % batch_size)
self.assertEqual( self.assertEqual(
_result_shapes_in_tf_function( _result_shapes_in_tf_function(
...@@ -737,7 +803,9 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -737,7 +803,9 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
def test_reexport(self, use_sp_model): def test_reexport(self, use_sp_model):
"""Test that preprocess keeps working after another save/load cycle.""" """Test that preprocess keeps working after another save/load cycle."""
path1 = self._do_export( path1 = self._do_export(
["d", "ef", "abc", "xy"], do_lower_case=True, default_seq_length=10, ["d", "ef", "abc", "xy"],
do_lower_case=True,
default_seq_length=10,
tokenize_with_offsets=False, tokenize_with_offsets=False,
experimental_disable_assert=True, # TODO(b/175369555): drop this. experimental_disable_assert=True, # TODO(b/175369555): drop this.
use_sp_model=use_sp_model) use_sp_model=use_sp_model)
...@@ -752,35 +820,46 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -752,35 +820,46 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
inputs = tf.constant(["abc d ef", "ABC D EF d"]) inputs = tf.constant(["abc d ef", "ABC D EF d"])
bert_inputs = model2(inputs) bert_inputs = model2(inputs)
self.assertAllEqual(bert_inputs["input_word_ids"], self.assertAllEqual(
tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0], bert_inputs["input_word_ids"],
[2, 6, 4, 5, 4, 3, 0, 0, 0, 0]])) tf.constant([[2, 6, 4, 5, 3, 0, 0, 0, 0, 0],
self.assertAllEqual(bert_inputs["input_mask"], [2, 6, 4, 5, 4, 3, 0, 0, 0, 0]]))
tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], self.assertAllEqual(
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])) bert_inputs["input_mask"],
self.assertAllEqual(bert_inputs["input_type_ids"], tf.constant([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]))
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])) self.assertAllEqual(
bert_inputs["input_type_ids"],
tf.constant([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
@parameterized.named_parameters(("Bert", True), ("Albert", False)) @parameterized.named_parameters(("Bert", True), ("Albert", False))
def test_preprocessing_for_mlm(self, use_bert): def test_preprocessing_for_mlm(self, use_bert):
"""Combines both SavedModel types and TF.text helpers for MLM.""" """Combines both SavedModel types and TF.text helpers for MLM."""
# Create the preprocessing SavedModel with a [MASK] token. # Create the preprocessing SavedModel with a [MASK] token.
non_special_tokens = ["hello", "world", non_special_tokens = [
"nice", "movie", "great", "actors", "hello", "world", "nice", "movie", "great", "actors", "quick", "fox",
"quick", "fox", "lazy", "dog"] "lazy", "dog"
preprocess = tf.saved_model.load(self._do_export( ]
non_special_tokens, do_lower_case=True,
tokenize_with_offsets=use_bert, # TODO(b/181866850): drop this. preprocess = tf.saved_model.load(
experimental_disable_assert=True, # TODO(b/175369555): drop this. self._do_export(
add_mask_token=True, use_sp_model=not use_bert)) non_special_tokens,
do_lower_case=True,
tokenize_with_offsets=use_bert, # TODO(b/181866850): drop this.
experimental_disable_assert=True, # TODO(b/175369555): drop this.
add_mask_token=True,
use_sp_model=not use_bert))
vocab_size = len(non_special_tokens) + (5 if use_bert else 7) vocab_size = len(non_special_tokens) + (5 if use_bert else 7)
# Create the encoder SavedModel with an .mlm subobject. # Create the encoder SavedModel with an .mlm subobject.
hidden_size = 16 hidden_size = 16
num_hidden_layers = 2 num_hidden_layers = 2
bert_config, encoder_config = _get_bert_config_or_encoder_config( bert_config, encoder_config = _get_bert_config_or_encoder_config(
use_bert, hidden_size, num_hidden_layers, vocab_size) use_bert_config=use_bert,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
vocab_size=vocab_size)
_, pretrainer = export_tfhub_lib._create_model( _, pretrainer = export_tfhub_lib._create_model(
bert_config=bert_config, encoder_config=encoder_config, with_mlm=True) bert_config=bert_config, encoder_config=encoder_config, with_mlm=True)
model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint") model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
...@@ -814,8 +893,10 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -814,8 +893,10 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
self.assertEqual(mask_id, 4) self.assertEqual(mask_id, 4)
# A batch of 3 segment pairs. # A batch of 3 segment pairs.
raw_segments = [tf.constant(["hello", "nice movie", "quick fox"]), raw_segments = [
tf.constant(["world", "great actors", "lazy dog"])] tf.constant(["hello", "nice movie", "quick fox"]),
tf.constant(["world", "great actors", "lazy dog"])
]
batch_size = 3 batch_size = 3
# Misc hyperparameters. # Misc hyperparameters.
...@@ -842,18 +923,18 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -842,18 +923,18 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
selection_rate=0.5, # Adjusted for the short test examples. selection_rate=0.5, # Adjusted for the short test examples.
unselectable_ids=[start_of_sequence_id, end_of_segment_id]), unselectable_ids=[start_of_sequence_id, end_of_segment_id]),
mask_values_chooser=text.MaskValuesChooser( mask_values_chooser=text.MaskValuesChooser(
vocab_size=vocab_size, mask_token=mask_id, vocab_size=vocab_size,
mask_token=mask_id,
# Always put [MASK] to have a predictable result. # Always put [MASK] to have a predictable result.
mask_token_rate=1.0, random_token_rate=0.0)) mask_token_rate=1.0,
random_token_rate=0.0))
# Pad to fixed-length Transformer encoder inputs. # Pad to fixed-length Transformer encoder inputs.
input_word_ids, _ = text.pad_model_inputs(masked_input_ids, input_word_ids, _ = text.pad_model_inputs(
seq_length, masked_input_ids, seq_length, pad_value=padding_id)
pad_value=padding_id) input_type_ids, input_mask = text.pad_model_inputs(
input_type_ids, input_mask = text.pad_model_inputs(segment_ids, seq_length, segment_ids, seq_length, pad_value=0)
pad_value=0) masked_lm_positions, _ = text.pad_model_inputs(
masked_lm_positions, _ = text.pad_model_inputs(masked_lm_positions, masked_lm_positions, max_selections_per_seq, pad_value=0)
max_selections_per_seq,
pad_value=0)
masked_lm_positions = tf.cast(masked_lm_positions, tf.int32) masked_lm_positions = tf.cast(masked_lm_positions, tf.int32)
num_predictions = int(tf.shape(masked_lm_positions)[1]) num_predictions = int(tf.shape(masked_lm_positions)[1])
...@@ -865,7 +946,8 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase): ...@@ -865,7 +946,8 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
# [CLS] nice movie [SEP] great actors [SEP] # [CLS] nice movie [SEP] great actors [SEP]
[2, 7, 8, 3, 9, 10, 3, 0, 0, 0], [2, 7, 8, 3, 9, 10, 3, 0, 0, 0],
# [CLS] brown fox [SEP] lazy dog [SEP] # [CLS] brown fox [SEP] lazy dog [SEP]
[2, 11, 12, 3, 13, 14, 3, 0, 0, 0]]) [2, 11, 12, 3, 13, 14, 3, 0, 0, 0]
])
for i in range(batch_size): for i in range(batch_size):
for j in range(num_predictions): for j in range(num_predictions):
k = int(masked_lm_positions[i, j]) k = int(masked_lm_positions[i, j])
@@ -896,15 +978,17 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_special_tokens_in_estimator(self, use_sp_model):
    """Tests getting special tokens without an Eager init context."""
-    preprocess_export_path = self._do_export(
-        ["d", "ef", "abc", "xy"], do_lower_case=True,
-        use_sp_model=use_sp_model, tokenize_with_offsets=False)
+    preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"],
+                                             do_lower_case=True,
+                                             use_sp_model=use_sp_model,
+                                             tokenize_with_offsets=False)

    def _get_special_tokens_dict(obj):
      """Returns special tokens of restored tokenizer as Python values."""
      if tf.executing_eagerly():
-        special_tokens_numpy = {k: v.numpy()
-                                for k, v in obj.get_special_tokens_dict().items()}
+        special_tokens_numpy = {
+            k: v.numpy() for k, v in obj.get_special_tokens_dict().items()
+        }
      else:
        with tf.Graph().as_default():
          # This code expects `get_special_tokens_dict()` to be a tf.function
@@ -913,8 +997,10 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
          special_tokens_tensors = obj.get_special_tokens_dict()
          with tf.compat.v1.Session() as sess:
            special_tokens_numpy = sess.run(special_tokens_tensors)
-      return {k: v.item()  # Numpy to Python.
-              for k, v in special_tokens_numpy.items()}
+      return {
+          k: v.item()  # Numpy to Python.
+          for k, v in special_tokens_numpy.items()
+      }

    def input_fn():
      self.assertFalse(tf.executing_eagerly())
@@ -927,7 +1013,8 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
        self.assertIsInstance(v, int, "Unexpected type for {}".format(k))
      tokens = tokenize(sentences)
-      packed_inputs = layers.BertPackInputs(
-          4, special_tokens_dict=special_tokens_dict)(tokens)
+      packed_inputs = layers.BertPackInputs(
+          4, special_tokens_dict=special_tokens_dict)(
+              tokens)
      preprocessing = tf.keras.Model(sentences, packed_inputs)
      # Map the dataset.
      ds = tf.data.Dataset.from_tensors(
@@ -937,22 +1024,22 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
    def model_fn(features, labels, mode):
      del labels  # Unused.
-      return tf.estimator.EstimatorSpec(mode=mode,
-                                        predictions=features["input_word_ids"])
+      return tf.estimator.EstimatorSpec(
+          mode=mode, predictions=features["input_word_ids"])

    estimator = tf.estimator.Estimator(model_fn=model_fn)
    outputs = list(estimator.predict(input_fn))
-    self.assertAllEqual(outputs, np.array([[2, 6, 3, 0],
-                                           [2, 4, 5, 3]]))
+    self.assertAllEqual(outputs, np.array([[2, 6, 3, 0], [2, 4, 5, 3]]))
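For context, a hedged sketch (not from this change) of how `layers.BertPackInputs` yields the `[[2, 6, 3, 0], [2, 4, 5, 3]]` ids asserted above, assuming the test's toy vocabulary with [CLS]=2, [SEP]=3 and padding=0:

  import tensorflow as tf
  from official.nlp.modeling import layers

  # One tokenized segment per example, using the toy vocabulary's ids.
  tokens = tf.ragged.constant([[6], [4, 5]])
  packer = layers.BertPackInputs(
      seq_length=4,
      special_tokens_dict=dict(
          start_of_sequence_id=2, end_of_segment_id=3, padding_id=0))
  packed = packer(tokens)
  # packed["input_word_ids"] == [[2, 6, 3, 0], [2, 4, 5, 3]]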
  # TODO(b/175369555): Remove that code and its test.
  @parameterized.named_parameters(("Bert", False), ("Sentencepiece", True))
  def test_check_no_assert(self, use_sp_model):
    """Tests the self-check during export without assertions."""
-    preprocess_export_path = self._do_export(
-        ["d", "ef", "abc", "xy"], do_lower_case=True,
-        use_sp_model=use_sp_model, tokenize_with_offsets=False,
-        experimental_disable_assert=False)
+    preprocess_export_path = self._do_export(["d", "ef", "abc", "xy"],
+                                             do_lower_case=True,
+                                             use_sp_model=use_sp_model,
+                                             tokenize_with_offsets=False,
+                                             experimental_disable_assert=False)
    with self.assertRaisesRegex(AssertionError,
                                r"failed to suppress \d+ Assert ops"):
      export_tfhub_lib._check_no_assert(preprocess_export_path)
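The body of `_check_no_assert` is elided from this diff; one plausible shape for such a self-check (an illustrative sketch, not the Model Garden implementation) is to reload the SavedModel and count Assert ops left in a signature's graph:

  import tensorflow as tf

  def count_assert_ops(saved_model_path, signature_key="serving_default"):
    """Counts Assert ops in one signature's graph of a SavedModel."""
    obj = tf.saved_model.load(saved_model_path)
    graph = obj.signatures[signature_key].graph
    return sum(op.type == "Assert" for op in graph.get_operations())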
@@ -963,8 +1050,8 @@ def _result_shapes_in_tf_function(fn, *args, **kwargs):
  Args:
    fn: A callable.
-    *args: TensorSpecs for Tensor-valued arguments and actual values
-      for Python-valued arguments to fn.
+    *args: TensorSpecs for Tensor-valued arguments and actual values for
+      Python-valued arguments to fn.
    **kwargs: Same for keyword arguments.
  Returns:
...
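The docstring above describes mixing TensorSpecs for tensor arguments with real Python values; the helper's body is elided here, but a hedged sketch of the underlying idea is to trace the callable once and read the static shapes of its symbolic outputs:

  import tensorflow as tf

  def result_shapes(fn, *args, **kwargs):
    # Tracing with TensorSpecs produces symbolic outputs whose static
    # shapes are known without running the computation.
    concrete = tf.function(fn).get_concrete_function(*args, **kwargs)
    return tf.nest.map_structure(lambda t: t.shape,
                                 concrete.structured_outputs)

  shapes = result_shapes(
      lambda x: tf.matmul(x, x, transpose_b=True),
      tf.TensorSpec(shape=[None, 8], dtype=tf.float32))
  # shapes == TensorShape([None, None])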
@@ -40,9 +40,9 @@ from typing import List, Optional, Tuple
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
-from official.vision.beta.configs import backbones_3d
-from official.vision.beta.configs import common
-from official.vision.beta.configs import video_classification
+from official.vision.configs import backbones_3d
+from official.vision.configs import common
+from official.vision.configs import video_classification

@dataclasses.dataclass
...
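This hunk and the similar ones below are a mechanical migration: the `official.vision.beta` package has been promoted to `official.vision`, so imports simply drop the `.beta` segment. For a downstream user the change looks like this (the experiment name is illustrative and assumes it is registered by the configs module):

  # Before: from official.vision.beta.configs import video_classification
  from official.vision.configs import video_classification
  from official.core import exp_factory

  config = exp_factory.get_exp_config("video_classification_kinetics600")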
@@ -18,7 +18,7 @@ import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.projects.assemblenet.configs import assemblenet
-from official.vision.beta.configs import video_classification as exp_cfg
+from official.vision.configs import video_classification as exp_cfg

class AssemblenetTest(tf.test.TestCase, parameterized.TestCase):
...
@@ -57,8 +57,8 @@ import tensorflow as tf
from official.modeling import hyperparams
from official.projects.assemblenet.configs import assemblenet as cfg
from official.projects.assemblenet.modeling import rep_flow_2d_layer as rf
-from official.vision.beta.modeling import factory_3d as model_factory
-from official.vision.beta.modeling.backbones import factory as backbone_factory
+from official.vision.modeling import factory_3d as model_factory
+from official.vision.modeling.backbones import factory as backbone_factory

layers = tf.keras.layers
intermediate_channel_size = [64, 128, 256, 512]
...
@@ -64,8 +64,8 @@ from official.modeling import hyperparams
from official.projects.assemblenet.configs import assemblenet as cfg
from official.projects.assemblenet.modeling import assemblenet as asn
from official.projects.assemblenet.modeling import rep_flow_2d_layer as rf
-from official.vision.beta.modeling import factory_3d as model_factory
-from official.vision.beta.modeling.backbones import factory as backbone_factory
+from official.vision.modeling import factory_3d as model_factory
+from official.vision.modeling.backbones import factory as backbone_factory

layers = tf.keras.layers
...
@@ -29,9 +29,6 @@ from absl import flags
from absl import logging
import gin
-# pylint: disable=unused-import
-from official.common import registry_imports
-# pylint: enable=unused-import
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
@@ -42,6 +39,7 @@ from official.modeling import performance
from official.projects.assemblenet.configs import assemblenet as asn_configs
from official.projects.assemblenet.modeling import assemblenet as asn
from official.projects.assemblenet.modeling import assemblenet_plus as asnp
+from official.vision import registry_imports
# pylint: enable=unused-import

FLAGS = flags.FLAGS
...
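`registry_imports` is imported purely for its side effects: importing the module runs the registration decorators so that tasks and experiment configs can later be resolved by name. A minimal sketch of the pattern under the new path (the experiment name and logging directory are illustrative):

  # Imported for side effects: populates the task and config registries.
  from official.vision import registry_imports  # pylint: disable=unused-import
  from official.core import exp_factory
  from official.core import task_factory

  params = exp_factory.get_exp_config("video_classification_kinetics600")
  task = task_factory.get_task(params.task, logging_dir="/tmp/model_dir")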
@@ -22,7 +22,7 @@ from absl import logging
from absl.testing import flagsaver
import tensorflow as tf
from official.projects.assemblenet import train as train_lib
-from official.vision.beta.dataloaders import tfexample_utils
+from official.vision.dataloaders import tfexample_utils

FLAGS = flags.FLAGS
...
@@ -20,8 +20,8 @@ import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import input_reader
-from official.vision.beta.ops import box_ops
-from official.vision.beta.ops import preprocess_ops
+from official.vision.ops import box_ops
+from official.vision.ops import preprocess_ops

@dataclasses.dataclass
...