Commit 32e4ca51 authored by qianyj's avatar qianyj
Browse files

Update code to v2.11.0

parents 9485aa1d 71060f67
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tensorflow_models.official.projects.detr.ops.matchers."""
import numpy as np
from scipy import optimize
import tensorflow as tf
from official.projects.detr.ops import matchers
class MatchersOpsTest(tf.test.TestCase):
  """Tests for the Hungarian matching ops."""

  def testLinearSumAssignment(self):
    """Check a simple 2D test case of the Linear Sum Assignment problem.

    Ensures that the implementation of the matching algorithm is correct
    and functional on TPUs.
    """
    costs = np.array([[[4, 1, 3], [2, 0, 5], [3, 2, 2]]], dtype=np.float32)
    _, match = matchers.hungarian_matching(tf.constant(costs))
    expected = np.array(
        [[0, 1, 0],
         [1, 0, 0],
         [0, 0, 1]], dtype=bool)
    self.assertAllEqual(match.numpy()[0], expected)

  def testBatchedLinearSumAssignment(self):
    """Check a batched case of the Linear Sum Assignment Problem.

    Ensures that a correct solution is found for all inputted problems within
    a batch.
    """
    costs = np.array(
        [
            [[4, 1, 3], [2, 0, 5], [3, 2, 2]],
            [[1, 4, 3], [0, 2, 5], [2, 3, 2]],
            [[1, 3, 4], [0, 5, 2], [2, 2, 3]],
        ],
        dtype=np.float32)
    _, match = matchers.hungarian_matching(tf.constant(costs))
    # Hand solved correct output for the linear sum assignment problem.
    expected = np.array(
        [
            [[0, 1, 0], [1, 0, 0], [0, 0, 1]],
            [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
            [[1, 0, 0], [0, 0, 1], [0, 1, 0]],
        ],
        dtype=bool)
    self.assertAllClose(match.numpy(), expected)

  def testMaximumBipartiteMatching(self):
    """Check that the maximum bipartite match assigns the correct numbers."""
    adjacency = tf.cast(
        [[
            [1, 0, 0, 0, 1],
            [0, 1, 0, 1, 0],
            [0, 0, 1, 0, 0],
            [0, 1, 0, 0, 0],
            [1, 0, 0, 0, 0],
        ]], tf.bool)
    _, assignment = matchers._maximum_bipartite_matching(adjacency)
    self.assertEqual(np.sum(assignment.numpy()), 5)

  def testAssignmentMatchesScipy(self):
    """Check that the Linear Sum Assignment matches the Scipy implementation."""
    batch_size, num_elems = 2, 25
    weights = tf.random.uniform(
        (batch_size, num_elems, num_elems), minval=0., maxval=1.)
    weights, assignment = matchers.hungarian_matching(weights)
    assignment_np = assignment.numpy()
    for idx in range(batch_size):
      _, scipy_assignment = optimize.linear_sum_assignment(weights.numpy()[idx])
      hungarian_assignment = np.where(assignment_np[idx])[1]
      self.assertAllEqual(hungarian_assignment, scipy_assignment)
# Run the test suite when invoked as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Customized optimizer to match paper results."""
import dataclasses
import tensorflow as tf
from official.modeling import optimization
from official.nlp import optimization as nlp_optimization
@dataclasses.dataclass
class DETRAdamWConfig(optimization.AdamWeightDecayConfig):
  """AdamW config for DETR; currently identical to `AdamWeightDecayConfig`."""
  pass
@dataclasses.dataclass
class OptimizerConfig(optimization.OptimizerConfig):
  """Optimizer oneof config extended with the custom `detr_adamw` option."""
  # NOTE(review): the default is a single shared instance across dataclass
  # instances — presumably the base Config machinery copies defaults; verify.
  detr_adamw: DETRAdamWConfig = DETRAdamWConfig()
@dataclasses.dataclass
class OptimizationConfig(optimization.OptimizationConfig):
  """Configuration for optimizer and learning rate schedule.

  Attributes:
    optimizer: optimizer oneof config.
    ema: optional exponential moving average optimizer config, if specified, ema
      optimizer will be used.
    learning_rate: learning rate oneof config.
    warmup: warmup oneof config.
  """
  # Overridden so that the 'detr_adamw' optimizer type is selectable.
  optimizer: OptimizerConfig = OptimizerConfig()
# TODO(frederickliu): figure out how to make this configuable.
# TODO(frederickliu): Study if this is needed.
class _DETRAdamW(nlp_optimization.AdamWeightDecay):
  """Custom AdamW to support different lr scaling for backbone.

  The code is copied from AdamWeightDecay and Adam with learning scaling.
  Variables whose name does not contain 'detr' (i.e. backbone variables)
  are updated with 0.1x the learning rate.
  """

  def _resource_apply_dense(self, grad, var, apply_state=None):
    """Applies one dense AdamW update to `var` with backbone lr scaling."""
    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
    apply_state = kwargs['apply_state']
    # Weight decay also uses the downscaled lr for backbone variables.
    if 'detr' not in var.name:
      lr_t *= 0.1
    decay = self._decay_weights_op(var, lr_t, apply_state)
    # Decay must complete before the Adam update reads/writes the variable.
    with tf.control_dependencies([decay]):
      var_device, var_dtype = var.device, var.dtype.base_dtype
      coefficients = ((apply_state or {}).get((var_device, var_dtype))
                      or self._fallback_apply_state(var_device, var_dtype))
      m = self.get_slot(var, 'm')
      v = self.get_slot(var, 'v')
      # Mirror the 0.1x backbone scaling for the Adam step itself.
      lr = coefficients[
          'lr_t'] * 0.1 if 'detr' not in var.name else coefficients['lr_t']
      if not self.amsgrad:
        return tf.raw_ops.ResourceApplyAdam(
            var=var.handle,
            m=m.handle,
            v=v.handle,
            beta1_power=coefficients['beta_1_power'],
            beta2_power=coefficients['beta_2_power'],
            lr=lr,
            beta1=coefficients['beta_1_t'],
            beta2=coefficients['beta_2_t'],
            epsilon=coefficients['epsilon'],
            grad=grad,
            use_locking=self._use_locking)
      else:
        vhat = self.get_slot(var, 'vhat')
        return tf.raw_ops.ResourceApplyAdamWithAmsgrad(
            var=var.handle,
            m=m.handle,
            v=v.handle,
            vhat=vhat.handle,
            beta1_power=coefficients['beta_1_power'],
            beta2_power=coefficients['beta_2_power'],
            lr=lr,
            beta1=coefficients['beta_1_t'],
            beta2=coefficients['beta_2_t'],
            epsilon=coefficients['epsilon'],
            grad=grad,
            use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    """Applies one sparse AdamW update to `var` with backbone lr scaling."""
    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
    apply_state = kwargs['apply_state']
    # Weight decay also uses the downscaled lr for backbone variables.
    if 'detr' not in var.name:
      lr_t *= 0.1
    decay = self._decay_weights_op(var, lr_t, apply_state)
    with tf.control_dependencies([decay]):
      var_device, var_dtype = var.device, var.dtype.base_dtype
      coefficients = ((apply_state or {}).get((var_device, var_dtype))
                      or self._fallback_apply_state(var_device, var_dtype))

      # m_t = beta1 * m + (1 - beta1) * g_t
      m = self.get_slot(var, 'm')
      m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
      m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'],
                                use_locking=self._use_locking)
      with tf.control_dependencies([m_t]):
        m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

      # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
      v = self.get_slot(var, 'v')
      v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
      v_t = tf.compat.v1.assign(v, v * coefficients['beta_2_t'],
                                use_locking=self._use_locking)
      with tf.control_dependencies([v_t]):
        v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
      # Mirror the 0.1x backbone scaling for the Adam step itself.
      lr = coefficients[
          'lr_t'] * 0.1 if 'detr' not in var.name else coefficients['lr_t']
      if not self.amsgrad:
        v_sqrt = tf.sqrt(v_t)
        var_update = tf.compat.v1.assign_sub(
            var, lr * m_t / (v_sqrt + coefficients['epsilon']),
            use_locking=self._use_locking)
        return tf.group(*[var_update, m_t, v_t])
      else:
        v_hat = self.get_slot(var, 'vhat')
        v_hat_t = tf.maximum(v_hat, v_t)
        with tf.control_dependencies([v_hat_t]):
          v_hat_t = tf.compat.v1.assign(
              v_hat, v_hat_t, use_locking=self._use_locking)
        v_hat_sqrt = tf.sqrt(v_hat_t)
        var_update = tf.compat.v1.assign_sub(
            var,
            lr * m_t / (v_hat_sqrt + coefficients['epsilon']),
            use_locking=self._use_locking)
        return tf.group(*[var_update, m_t, v_t, v_hat_t])
optimization.register_optimizer_cls('detr_adamw', _DETRAdamW)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DETR detection task definition."""
from typing import Optional
from absl import logging
import tensorflow as tf
from official.common import dataset_fn
from official.core import base_task
from official.core import task_factory
from official.projects.detr.configs import detr as detr_cfg
from official.projects.detr.dataloaders import coco
from official.projects.detr.dataloaders import detr_input
from official.projects.detr.modeling import detr
from official.projects.detr.ops import matchers
from official.vision.dataloaders import input_reader_factory
from official.vision.dataloaders import tf_example_decoder
from official.vision.dataloaders import tfds_factory
from official.vision.dataloaders import tf_example_label_map_decoder
from official.vision.evaluation import coco_evaluator
from official.vision.modeling import backbones
from official.vision.ops import box_ops
@task_factory.register_task_cls(detr_cfg.DetrTask)
class DetectionTask(base_task.Task):
  """A single-replica view of training procedure.

  DETR task provides artifacts for training/evaluation procedures, including
  loading/iterating over Datasets, initializing the model, calculating the
  loss, post-processing, and customized metrics with reduction.
  """

  def build_model(self):
    """Build DETR model."""
    input_specs = tf.keras.layers.InputSpec(shape=[None] +
                                            self._task_config.model.input_size)

    backbone = backbones.factory.build_backbone(
        input_specs=input_specs,
        backbone_config=self._task_config.model.backbone,
        norm_activation_config=self._task_config.model.norm_activation)

    model = detr.DETR(backbone,
                      self._task_config.model.backbone_endpoint_name,
                      self._task_config.model.num_queries,
                      self._task_config.model.hidden_size,
                      self._task_config.model.num_classes,
                      self._task_config.model.num_encoder_layers,
                      self._task_config.model.num_decoder_layers)
    return model

  def initialize(self, model: tf.keras.Model):
    """Loading pretrained checkpoint."""
    if not self._task_config.init_checkpoint:
      return

    ckpt_dir_or_file = self._task_config.init_checkpoint

    # Restoring checkpoint.
    if tf.io.gfile.isdir(ckpt_dir_or_file):
      ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)

    if self._task_config.init_checkpoint_modules == 'all':
      ckpt = tf.train.Checkpoint(**model.checkpoint_items)
      status = ckpt.restore(ckpt_dir_or_file)
      # Restoring everything: fail if any checkpoint value went unused.
      status.assert_consumed()
    elif self._task_config.init_checkpoint_modules == 'backbone':
      ckpt = tf.train.Checkpoint(backbone=model.backbone)
      status = ckpt.restore(ckpt_dir_or_file)
      # Backbone-only restore: tolerate extra checkpoint values, but all
      # backbone objects must be matched.
      status.expect_partial().assert_existing_objects_matched()

    logging.info('Finished loading pretrained checkpoint from %s',
                 ckpt_dir_or_file)

  def build_inputs(self,
                   params,
                   input_context: Optional[tf.distribute.InputContext] = None):
    """Build input dataset."""
    if isinstance(params, coco.COCODataConfig):
      dataset = coco.COCODataLoader(params).load(input_context)
    else:
      if params.tfds_name:
        decoder = tfds_factory.get_detection_decoder(params.tfds_name)
      else:
        decoder_cfg = params.decoder.get()
        if params.decoder.type == 'simple_decoder':
          decoder = tf_example_decoder.TfExampleDecoder(
              regenerate_source_id=decoder_cfg.regenerate_source_id)
        elif params.decoder.type == 'label_map_decoder':
          decoder = tf_example_label_map_decoder.TfExampleDecoderLabelMap(
              label_map=decoder_cfg.label_map,
              regenerate_source_id=decoder_cfg.regenerate_source_id)
        else:
          raise ValueError('Unknown decoder type: {}!'.format(
              params.decoder.type))

      parser = detr_input.Parser(
          class_offset=self._task_config.losses.class_offset,
          output_size=self._task_config.model.input_size[:2],
      )

      reader = input_reader_factory.input_reader_generator(
          params,
          dataset_fn=dataset_fn.pick_dataset_fn(params.file_type),
          decoder_fn=decoder.decode,
          parser_fn=parser.parse_fn(params.is_training))
      dataset = reader.read(input_context=input_context)

    return dataset

  def _compute_cost(self, cls_outputs, box_outputs, cls_targets, box_targets):
    """Computes the pairwise matching cost used by Hungarian matching."""
    # Approximate classification cost with 1 - prob[target class].
    # The 1 is a constant that doesn't change the matching, it can be omitted.
    # background: 0
    cls_cost = self._task_config.losses.lambda_cls * tf.gather(
        -tf.nn.softmax(cls_outputs), cls_targets, batch_dims=1, axis=-1)

    # Compute the L1 cost between boxes,
    paired_differences = self._task_config.losses.lambda_box * tf.abs(
        tf.expand_dims(box_outputs, 2) - tf.expand_dims(box_targets, 1))
    box_cost = tf.reduce_sum(paired_differences, axis=-1)

    # Compute the giou cost between boxes
    giou_cost = self._task_config.losses.lambda_giou * -box_ops.bbox_generalized_overlap(
        box_ops.cycxhw_to_yxyx(box_outputs),
        box_ops.cycxhw_to_yxyx(box_targets))

    total_cost = cls_cost + box_cost + giou_cost

    # Upper bound on the cost of any valid pair (L1 distance of cycxhw boxes
    # in [0, 1] is at most 4); used for padding and for inf/nan sanitizing.
    max_cost = (
        self._task_config.losses.lambda_cls * 0.0 +
        self._task_config.losses.lambda_box * 4. +
        self._task_config.losses.lambda_giou * 0.0)

    # Set pads to large constant
    valid = tf.expand_dims(
        tf.cast(tf.not_equal(cls_targets, 0), dtype=total_cost.dtype), axis=1)
    total_cost = (1 - valid) * max_cost + valid * total_cost

    # Set inf of nan to large constant
    total_cost = tf.where(
        tf.logical_or(tf.math.is_nan(total_cost), tf.math.is_inf(total_cost)),
        max_cost * tf.ones_like(total_cost, dtype=total_cost.dtype),
        total_cost)

    return total_cost

  def build_losses(self, outputs, labels, aux_losses=None):
    """Build DETR losses."""
    cls_outputs = outputs['cls_outputs']
    box_outputs = outputs['box_outputs']
    cls_targets = labels['classes']
    box_targets = labels['boxes']

    cost = self._compute_cost(
        cls_outputs, box_outputs, cls_targets, box_targets)

    _, indices = matchers.hungarian_matching(cost)
    # The matching itself is not differentiable.
    indices = tf.stop_gradient(indices)

    # Reorder predictions so each one lines up with its matched target.
    target_index = tf.math.argmax(indices, axis=1)
    cls_assigned = tf.gather(cls_outputs, target_index, batch_dims=1, axis=1)
    box_assigned = tf.gather(box_outputs, target_index, batch_dims=1, axis=1)

    background = tf.equal(cls_targets, 0)
    num_boxes = tf.reduce_sum(
        tf.cast(tf.logical_not(background), tf.float32), axis=-1)

    # Down-weight background to account for class imbalance.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=cls_targets, logits=cls_assigned)
    cls_loss = self._task_config.losses.lambda_cls * tf.where(
        background, self._task_config.losses.background_cls_weight * xentropy,
        xentropy)
    cls_weights = tf.where(
        background,
        self._task_config.losses.background_cls_weight * tf.ones_like(cls_loss),
        tf.ones_like(cls_loss))

    # Box loss is only calculated on non-background class.
    l_1 = tf.reduce_sum(tf.abs(box_assigned - box_targets), axis=-1)
    box_loss = self._task_config.losses.lambda_box * tf.where(
        background, tf.zeros_like(l_1), l_1)

    # Giou loss is only calculated on non-background class.
    giou = tf.linalg.diag_part(1.0 - box_ops.bbox_generalized_overlap(
        box_ops.cycxhw_to_yxyx(box_assigned),
        box_ops.cycxhw_to_yxyx(box_targets)
    ))
    giou_loss = self._task_config.losses.lambda_giou * tf.where(
        background, tf.zeros_like(giou), giou)

    # Consider doing all reduce once in train_step to speed up.
    # Normalizers are summed across replicas so each replica's loss is scaled
    # by the global box/weight counts.
    num_boxes_per_replica = tf.reduce_sum(num_boxes)
    cls_weights_per_replica = tf.reduce_sum(cls_weights)
    replica_context = tf.distribute.get_replica_context()
    num_boxes_sum, cls_weights_sum = replica_context.all_reduce(
        tf.distribute.ReduceOp.SUM,
        [num_boxes_per_replica, cls_weights_per_replica])
    cls_loss = tf.math.divide_no_nan(
        tf.reduce_sum(cls_loss), cls_weights_sum)
    box_loss = tf.math.divide_no_nan(
        tf.reduce_sum(box_loss), num_boxes_sum)
    giou_loss = tf.math.divide_no_nan(
        tf.reduce_sum(giou_loss), num_boxes_sum)

    aux_losses = tf.add_n(aux_losses) if aux_losses else 0.0

    total_loss = cls_loss + box_loss + giou_loss + aux_losses
    return total_loss, cls_loss, box_loss, giou_loss

  def build_metrics(self, training=True):
    """Build detection metrics."""
    metrics = []
    metric_names = ['cls_loss', 'box_loss', 'giou_loss']
    for name in metric_names:
      metrics.append(tf.keras.metrics.Mean(name, dtype=tf.float32))

    if not training:
      # COCO mAP evaluation is only needed for eval.
      self.coco_metric = coco_evaluator.COCOEvaluator(
          annotation_file=self._task_config.annotation_file,
          include_mask=False,
          need_rescale_bboxes=True,
          per_category_metrics=self._task_config.per_category_metrics)
    return metrics

  def train_step(self, inputs, model, optimizer, metrics=None):
    """Does forward and backward.

    Args:
      inputs: a dictionary of input tensors.
      model: the model, forward pass definition.
      optimizer: the optimizer for this training step.
      metrics: a nested structure of metrics objects.

    Returns:
      A dictionary of logs.
    """
    features, labels = inputs
    with tf.GradientTape() as tape:
      outputs = model(features, training=True)

      loss = 0.0
      cls_loss = 0.0
      box_loss = 0.0
      giou_loss = 0.0

      # The model returns per-decoder-layer outputs; accumulate the auxiliary
      # loss of every layer (deep supervision).
      for output in outputs:
        # Computes per-replica loss.
        layer_loss, layer_cls_loss, layer_box_loss, layer_giou_loss = self.build_losses(
            outputs=output, labels=labels, aux_losses=model.losses)
        loss += layer_loss
        cls_loss += layer_cls_loss
        box_loss += layer_box_loss
        giou_loss += layer_giou_loss

      # Consider moving scaling logic from build_losses to here.
      scaled_loss = loss
      # For mixed_precision policy, when LossScaleOptimizer is used, loss is
      # scaled for numerical stability.
      if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
        scaled_loss = optimizer.get_scaled_loss(scaled_loss)

    tvars = model.trainable_variables
    grads = tape.gradient(scaled_loss, tvars)
    # Scales back gradient when LossScaleOptimizer is used.
    if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
      grads = optimizer.get_unscaled_gradients(grads)
    optimizer.apply_gradients(list(zip(grads, tvars)))

    # Multiply for logging.
    # Since we expect the gradient replica sum to happen in the optimizer,
    # the loss is scaled with global num_boxes and weights.
    # To have it more interpretable/comparable we scale it back when logging.
    num_replicas_in_sync = tf.distribute.get_strategy().num_replicas_in_sync
    loss *= num_replicas_in_sync
    cls_loss *= num_replicas_in_sync
    box_loss *= num_replicas_in_sync
    giou_loss *= num_replicas_in_sync

    # Trainer class handles loss metric for you.
    logs = {self.loss: loss}

    all_losses = {
        'cls_loss': cls_loss,
        'box_loss': box_loss,
        'giou_loss': giou_loss,
    }

    # Metric results will be added to logs for you.
    if metrics:
      for m in metrics:
        m.update_state(all_losses[m.name])
    return logs

  def validation_step(self, inputs, model, metrics=None):
    """Validatation step.

    Args:
      inputs: a dictionary of input tensors.
      model: the keras.Model.
      metrics: a nested structure of metrics objects.

    Returns:
      A dictionary of logs.
    """
    features, labels = inputs

    # Only the final decoder layer's outputs are evaluated.
    outputs = model(features, training=False)[-1]
    loss, cls_loss, box_loss, giou_loss = self.build_losses(
        outputs=outputs, labels=labels, aux_losses=model.losses)

    # Multiply for logging.
    # Since we expect the gradient replica sum to happen in the optimizer,
    # the loss is scaled with global num_boxes and weights.
    # To have it more interpretable/comparable we scale it back when logging.
    num_replicas_in_sync = tf.distribute.get_strategy().num_replicas_in_sync
    loss *= num_replicas_in_sync
    cls_loss *= num_replicas_in_sync
    box_loss *= num_replicas_in_sync
    giou_loss *= num_replicas_in_sync

    # Evaluator class handles loss metric for you.
    logs = {self.loss: loss}

    # Boxes are rescaled by the image size stored in image_info so that the
    # COCO evaluator sees absolute yxyx coordinates.
    predictions = {
        'detection_boxes':
            box_ops.cycxhw_to_yxyx(outputs['box_outputs'])
            * tf.expand_dims(
                tf.concat([
                    labels['image_info'][:, 1:2, 0],
                    labels['image_info'][:, 1:2, 1],
                    labels['image_info'][:, 1:2, 0],
                    labels['image_info'][:, 1:2, 1]
                ],
                          axis=1),
                axis=1),
        'detection_scores':
            tf.math.reduce_max(
                tf.nn.softmax(outputs['cls_outputs'])[:, :, 1:], axis=-1),
        'detection_classes':
            tf.math.argmax(outputs['cls_outputs'][:, :, 1:], axis=-1) + 1,
        # Fix this. It's not being used at the moment.
        'num_detections':
            tf.reduce_sum(
                tf.cast(
                    tf.math.greater(
                        tf.math.reduce_max(outputs['cls_outputs'], axis=-1), 0),
                    tf.int32),
                axis=-1),
        'source_id': labels['id'],
        'image_info': labels['image_info']
    }

    ground_truths = {
        'source_id': labels['id'],
        'height': labels['image_info'][:, 0:1, 0],
        'width': labels['image_info'][:, 0:1, 1],
        'num_detections': tf.reduce_sum(
            tf.cast(tf.math.greater(labels['classes'], 0), tf.int32), axis=-1),
        'boxes': labels['gt_boxes'],
        'classes': labels['classes'],
        'is_crowds': labels['is_crowd']
    }
    logs.update({'predictions': predictions,
                 'ground_truths': ground_truths})

    all_losses = {
        'cls_loss': cls_loss,
        'box_loss': box_loss,
        'giou_loss': giou_loss,
    }

    # Metric results will be added to logs for you.
    if metrics:
      for m in metrics:
        m.update_state(all_losses[m.name])
    return logs

  def aggregate_logs(self, state=None, step_outputs=None):
    """Accumulates eval-step outputs into the COCO evaluator state."""
    if state is None:
      # First step of an eval epoch: start from a clean evaluator.
      self.coco_metric.reset_states()
      state = self.coco_metric

    state.update_state(
        step_outputs['ground_truths'],
        step_outputs['predictions'])
    return state

  def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
    """Returns final COCO metrics from the accumulated evaluator state."""
    return aggregated_logs.result()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for detection."""
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from official.projects.detr import optimization
from official.projects.detr.configs import detr as detr_cfg
from official.projects.detr.dataloaders import coco
from official.projects.detr.tasks import detection
from official.vision.configs import backbones
_NUM_EXAMPLES = 10
def _gen_fn():
h = np.random.randint(0, 300)
w = np.random.randint(0, 300)
num_boxes = np.random.randint(0, 50)
return {
'image': np.ones(shape=(h, w, 3), dtype=np.uint8),
'image/id': np.random.randint(0, 100),
'image/filename': 'test',
'objects': {
'is_crowd': np.ones(shape=(num_boxes), dtype=np.bool),
'bbox': np.ones(shape=(num_boxes, 4), dtype=np.float32),
'label': np.ones(shape=(num_boxes), dtype=np.int64),
'id': np.ones(shape=(num_boxes), dtype=np.int64),
'area': np.ones(shape=(num_boxes), dtype=np.int64),
}
}
def _as_dataset(self, *args, **kwargs):
  """Replacement `as_dataset` returning `_NUM_EXAMPLES` mocked examples."""
  del args
  del kwargs

  def _generate():
    for _ in range(_NUM_EXAMPLES):
      yield _gen_fn()

  return tf.data.Dataset.from_generator(
      _generate,
      output_types=self.info.features.dtype,
      output_shapes=self.info.features.shape,
  )
class DetectionTest(tf.test.TestCase):
  """Smoke tests for the DETR detection task with a COCO data config."""

  def test_train_step(self):
    """Builds task/model/optimizer and runs a single training step."""
    config = detr_cfg.DetrTask(
        model=detr_cfg.Detr(
            input_size=[1333, 1333, 3],
            # Tiny model so the test stays fast.
            num_encoder_layers=1,
            num_decoder_layers=1,
            num_classes=81,
            backbone=backbones.Backbone(
                type='resnet',
                resnet=backbones.ResNet(model_id=10, bn_trainable=False))
        ),
        train_data=coco.COCODataConfig(
            tfds_name='coco/2017',
            tfds_split='validation',
            is_training=True,
            global_batch_size=2,
        ))

    with tfds.testing.mock_data(as_dataset_fn=_as_dataset):
      task = detection.DetectionTask(config)
      model = task.build_model()
      dataset = task.build_inputs(config.train_data)
      iterator = iter(dataset)
      opt_cfg = optimization.OptimizationConfig({
          'optimizer': {
              'type': 'detr_adamw',
              'detr_adamw': {
                  'weight_decay_rate': 1e-4,
                  'global_clipnorm': 0.1,
              }
          },
          'learning_rate': {
              'type': 'stepwise',
              'stepwise': {
                  'boundaries': [120000],
                  'values': [0.0001, 1.0e-05]
              }
          },
      })
      optimizer = detection.DetectionTask.create_optimizer(opt_cfg)
      task.train_step(next(iterator), model, optimizer)

  def test_validation_step(self):
    """Builds the task and runs a single validation step plus aggregation."""
    config = detr_cfg.DetrTask(
        model=detr_cfg.Detr(
            input_size=[1333, 1333, 3],
            num_encoder_layers=1,
            num_decoder_layers=1,
            num_classes=81,
            backbone=backbones.Backbone(
                type='resnet',
                resnet=backbones.ResNet(model_id=10, bn_trainable=False))
        ),
        validation_data=coco.COCODataConfig(
            tfds_name='coco/2017',
            tfds_split='validation',
            is_training=False,
            global_batch_size=2,
        ))

    with tfds.testing.mock_data(as_dataset_fn=_as_dataset):
      task = detection.DetectionTask(config)
      model = task.build_model()
      metrics = task.build_metrics(training=False)
      dataset = task.build_inputs(config.validation_data)
      iterator = iter(dataset)
      logs = task.validation_step(next(iterator), model, metrics)
      state = task.aggregate_logs(step_outputs=logs)
      task.reduce_aggregated_logs(state)
class DetectionTFDSTest(tf.test.TestCase):
  """Same smoke tests as `DetectionTest`, but via the generic tfds pipeline."""

  def test_train_step(self):
    """Builds task/model/optimizer and runs a single training step."""
    config = detr_cfg.DetrTask(
        model=detr_cfg.Detr(
            input_size=[1333, 1333, 3],
            # Tiny model so the test stays fast.
            num_encoder_layers=1,
            num_decoder_layers=1,
            backbone=backbones.Backbone(
                type='resnet',
                resnet=backbones.ResNet(model_id=10, bn_trainable=False))
        ),
        # tfds labels start at 0, so shift them by one to keep 0 = background.
        losses=detr_cfg.Losses(class_offset=1),
        train_data=detr_cfg.DataConfig(
            tfds_name='coco/2017',
            tfds_split='validation',
            is_training=True,
            global_batch_size=2,
        ))

    with tfds.testing.mock_data(as_dataset_fn=_as_dataset):
      task = detection.DetectionTask(config)
      model = task.build_model()
      dataset = task.build_inputs(config.train_data)
      iterator = iter(dataset)
      opt_cfg = optimization.OptimizationConfig({
          'optimizer': {
              'type': 'detr_adamw',
              'detr_adamw': {
                  'weight_decay_rate': 1e-4,
                  'global_clipnorm': 0.1,
              }
          },
          'learning_rate': {
              'type': 'stepwise',
              'stepwise': {
                  'boundaries': [120000],
                  'values': [0.0001, 1.0e-05]
              }
          },
      })
      optimizer = detection.DetectionTask.create_optimizer(opt_cfg)
      task.train_step(next(iterator), model, optimizer)

  def test_validation_step(self):
    """Builds the task and runs a single validation step plus aggregation."""
    config = detr_cfg.DetrTask(
        model=detr_cfg.Detr(
            input_size=[1333, 1333, 3],
            num_encoder_layers=1,
            num_decoder_layers=1,
            backbone=backbones.Backbone(
                type='resnet',
                resnet=backbones.ResNet(model_id=10, bn_trainable=False))
        ),
        losses=detr_cfg.Losses(class_offset=1),
        validation_data=detr_cfg.DataConfig(
            tfds_name='coco/2017',
            tfds_split='validation',
            is_training=False,
            global_batch_size=2,
        ))

    with tfds.testing.mock_data(as_dataset_fn=_as_dataset):
      task = detection.DetectionTask(config)
      model = task.build_model()
      metrics = task.build_metrics(training=False)
      dataset = task.build_inputs(config.validation_data)
      iterator = iter(dataset)
      logs = task.validation_step(next(iterator), model, metrics)
      state = task.aggregate_logs(step_outputs=logs)
      task.reduce_aggregated_logs(state)
# Run the test suite when invoked as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TensorFlow Model Garden Vision training driver."""
from absl import app
from absl import flags
import gin
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
# pylint: disable=unused-import
from official.projects.detr.configs import detr
from official.projects.detr.tasks import detection
# pylint: enable=unused-import
FLAGS = flags.FLAGS
def main(_):
  """Parses flags/gin configs and launches the DETR training experiment."""
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  experiment_params = train_utils.parse_configuration(FLAGS)
  output_dir = FLAGS.model_dir

  # Pure eval modes do not output yaml files. Otherwise continuous eval job
  # may race against the train job for writing the same file.
  if 'train' in FLAGS.mode:
    train_utils.serialize_config(experiment_params, output_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case
  # of GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only
  # when dtype is float16.
  if experiment_params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        experiment_params.runtime.mixed_precision_dtype)

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=experiment_params.runtime.distribution_strategy,
      all_reduce_alg=experiment_params.runtime.all_reduce_alg,
      num_gpus=experiment_params.runtime.num_gpus,
      tpu_address=experiment_params.runtime.tpu)
  # The task (model/dataset builders) must be created under the strategy
  # scope so its variables are placed correctly.
  with strategy.scope():
    task = task_factory.get_task(experiment_params.task,
                                 logging_dir=output_dir)

  train_lib.run_experiment(
      distribution_strategy=strategy,
      task=task,
      mode=FLAGS.mode,
      params=experiment_params,
      model_dir=output_dir)

  train_utils.save_gin_config(FLAGS.mode, output_dir)
if __name__ == '__main__':
  # Register the standard Model Garden training flags before parsing argv.
  tfm_flags.define_flags()
  flags.mark_flags_as_required(['experiment', 'mode', 'model_dir'])
  app.run(main)
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
......@@ -13,7 +13,7 @@ task:
num_attention_heads: 4
intermediate_size: 512
hidden_activation: relu
hidden_dropout_prob: 0.0
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
intra_bottleneck_size: 128
initializer_range: 0.02
......
# MobileBERT-EdgeTPU-XXS model.
task:
model:
encoder:
type: mobilebert
mobilebert:
word_vocab_size: 30522
word_embed_size: 128
type_vocab_size: 2
max_sequence_length: 512
num_blocks: 6
hidden_size: 512
num_attention_heads: 4
intermediate_size: 1024
hidden_activation: relu
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
intra_bottleneck_size: 128
initializer_range: 0.02
key_query_shared_bottleneck: true
num_feedforward_networks: 2
normalization_type: no_norm
classifier_activation: false
layer_wise_distillation:
num_steps: 30000
warmup_steps: 0
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-3
decay_steps: 30000
end_to_end_distillation:
num_steps: 585000
warmup_steps: 20000
initial_learning_rate: 1.5e-3
end_learning_rate: 1.5e-7
decay_steps: 585000
distill_ground_truth_ratio: 0.5
optimizer:
optimizer:
lamb:
beta_1: 0.9
beta_2: 0.999
clipnorm: 1.0
epsilon: 1.0e-06
exclude_from_layer_adaptation: null
exclude_from_weight_decay: ['LayerNorm', 'bias', 'norm']
global_clipnorm: null
name: LAMB
weight_decay_rate: 0.01
type: lamb
orbit_config:
eval_interval: 1000
eval_steps: -1
mode: train
steps_per_loop: 1000
total_steps: 825000
runtime:
distribution_strategy: 'tpu'
student_model:
cls_heads: [{'activation': 'tanh',
'cls_token_idx': 0,
'dropout_rate': 0.0,
'inner_dim': 512,
'name': 'next_sentence',
'num_classes': 2}]
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: relu
hidden_dropout_prob: 0.0
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 1024
intra_bottleneck_size: 128
key_query_shared_bottleneck: true
max_sequence_length: 512
normalization_type: no_norm
num_attention_heads: 4
num_blocks: 6
num_feedforward_networks: 2
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: relu
mlm_initializer_range: 0.02
mlm_output_weights_use_proj: true
teacher_model:
cls_heads: []
encoder:
mobilebert:
attention_probs_dropout_prob: 0.1
classifier_activation: false
hidden_activation: gelu
hidden_dropout_prob: 0.1
hidden_size: 512
initializer_range: 0.02
input_mask_dtype: int32
intermediate_size: 4096
intra_bottleneck_size: 1024
key_query_shared_bottleneck: false
max_sequence_length: 512
normalization_type: layer_norm
num_attention_heads: 4
num_blocks: 24
num_feedforward_networks: 1
type_vocab_size: 2
use_bottleneck_attention: false
word_embed_size: 128
word_vocab_size: 30522
type: mobilebert
mlm_activation: gelu
mlm_initializer_range: 0.02
teacher_model_init_checkpoint: gs://**/uncased_L-24_H-1024_B-512_A-4_teacher/tf2_checkpoint/bert_model.ckpt-1
student_model_init_checkpoint: ''
train_datasest:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord*,gs://**/seq_512_mask_20/books.tfrecord*
is_training: true
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
eval_dataset:
block_length: 1
cache: false
cycle_length: null
deterministic: null
drop_remainder: true
enable_tf_data_service: false
global_batch_size: 2048
input_path: gs://**/seq_512_mask_20/wikipedia.tfrecord-00141-of-00500,gs://**/seq_512_mask_20/books.tfrecord-00141-of-00500
is_training: false
max_predictions_per_seq: 20
seq_length: 512
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ''
tfds_name: ''
tfds_skip_decoding_feature: ''
tfds_split: ''
use_next_sentence_label: true
use_position_id: false
use_v2_feature_names: false
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -123,7 +123,7 @@ class EdgeTPUMultiHeadAttention(tf.keras.layers.MultiHeadAttention):
"""Builds multi-head dot-product attention computations.
This function builds attributes necessary for `_compute_attention` to
costomize attention computation to replace the default dot-product
customize attention computation to replace the default dot-product
attention.
Args:
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -161,7 +161,7 @@ class MobileBERTEncoder(tf.keras.Model):
first_token = tf.squeeze(prev_output[:, 0:1, :], axis=1)
if classifier_activation:
self._pooler_layer = tf.keras.layers.experimental.EinsumDense(
self._pooler_layer = tf.keras.layers.EinsumDense(
'ab,bc->ac',
output_shape=hidden_size,
activation=tf.tanh,
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -85,6 +85,7 @@ def build_bert_pretrainer(pretrainer_cfg: params.PretrainerModelParams,
activation=tf_utils.get_activation(pretrainer_cfg.mlm_activation),
initializer=tf.keras.initializers.TruncatedNormal(
stddev=pretrainer_cfg.mlm_initializer_range),
output_weights_use_proj=pretrainer_cfg.mlm_output_weights_use_proj,
name='cls/predictions')
pretrainer = edgetpu_pretrainer.MobileBERTEdgeTPUPretrainer(
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment