"vscode:/vscode.git/clone" did not exist on "bfb35d0929c6e0b678a1a11a27357ca5c2cebc53"
Commit 472e2f80 authored by zhanggzh

Merge remote-tracking branch 'tf_model/main'

parents d91296eb f3a14f85
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model definition for the RetinaNet Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.dataloader import mode_keys
from official.legacy.detection.evaluation import factory as eval_factory
from official.legacy.detection.modeling import base_model
from official.legacy.detection.modeling import losses
from official.legacy.detection.modeling.architecture import factory
from official.legacy.detection.ops import postprocess_ops
class RetinanetModel(base_model.Model):
"""RetinaNet model function."""
def __init__(self, params):
super(RetinanetModel, self).__init__(params)
# For eval metrics.
self._params = params
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._head_fn = factory.retinanet_head_generator(params)
# Loss function.
self._cls_loss_fn = losses.RetinanetClassLoss(
params.retinanet_loss, params.architecture.num_classes)
self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
self._box_loss_weight = params.retinanet_loss.box_loss_weight
self._keras_model = None
# Predict function.
self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
params.architecture.min_level, params.architecture.max_level,
params.postprocess)
self._transpose_input = params.train.transpose_input
assert not self._transpose_input, 'Transpose input is not supported.'
# Input layer.
self._input_layer = tf.keras.layers.Input(
shape=(None, None, params.retinanet_parser.num_channels),
name='',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32)
def build_outputs(self, inputs, mode):
# If the input image is transposed (from NHWC to HWCN), we need to revert it
# back to the original shape before it's used in the computation.
if self._transpose_input:
inputs = tf.transpose(inputs, [3, 0, 1, 2])
backbone_features = self._backbone_fn(
inputs, is_training=(mode == mode_keys.TRAIN))
fpn_features = self._fpn_fn(
backbone_features, is_training=(mode == mode_keys.TRAIN))
cls_outputs, box_outputs = self._head_fn(
fpn_features, is_training=(mode == mode_keys.TRAIN))
if self._use_bfloat16:
levels = cls_outputs.keys()
for level in levels:
cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
model_outputs = {
'cls_outputs': cls_outputs,
'box_outputs': box_outputs,
}
return model_outputs
def build_loss_fn(self):
if self._keras_model is None:
raise ValueError('build_loss_fn() must be called after build_model().')
filter_fn = self.make_filter_trainable_variables_fn()
trainable_variables = filter_fn(self._keras_model.trainable_variables)
def _total_loss_fn(labels, outputs):
cls_loss = self._cls_loss_fn(outputs['cls_outputs'],
labels['cls_targets'],
labels['num_positives'])
box_loss = self._box_loss_fn(outputs['box_outputs'],
labels['box_targets'],
labels['num_positives'])
model_loss = cls_loss + self._box_loss_weight * box_loss
l2_regularization_loss = self.weight_decay_loss(trainable_variables)
total_loss = model_loss + l2_regularization_loss
return {
'total_loss': total_loss,
'cls_loss': cls_loss,
'box_loss': box_loss,
'model_loss': model_loss,
'l2_regularization_loss': l2_regularization_loss,
}
return _total_loss_fn
def build_model(self, params, mode=None):
if self._keras_model is None:
outputs = self.model_outputs(self._input_layer, mode)
model = tf.keras.models.Model(
inputs=self._input_layer, outputs=outputs, name='retinanet')
assert model is not None, 'Failed to build tf.keras.Model.'
model.optimizer = self.build_optimizer()
self._keras_model = model
return self._keras_model
def post_processing(self, labels, outputs):
# TODO(yeqing): Moves the output related part into build_outputs.
required_output_fields = ['cls_outputs', 'box_outputs']
for field in required_output_fields:
if field not in outputs:
raise ValueError('"%s" is missing in outputs, required %s found %s' %
(field, required_output_fields, outputs.keys()))
required_label_fields = ['image_info', 'groundtruths']
for field in required_label_fields:
if field not in labels:
raise ValueError('"%s" is missing in labels, required %s found %s' %
(field, required_label_fields, labels.keys()))
boxes, scores, classes, valid_detections = self._generate_detections_fn(
outputs['box_outputs'], outputs['cls_outputs'], labels['anchor_boxes'],
labels['image_info'][:, 1:2, :])
# Discards the old output tensors to save memory. The `cls_outputs` and
# `box_outputs` are pretty big and could potentially lead to memory issues.
outputs = {
'source_id': labels['groundtruths']['source_id'],
'image_info': labels['image_info'],
'num_detections': valid_detections,
'detection_boxes': boxes,
'detection_classes': classes,
'detection_scores': scores,
}
if 'groundtruths' in labels:
labels['source_id'] = labels['groundtruths']['source_id']
labels['boxes'] = labels['groundtruths']['boxes']
labels['classes'] = labels['groundtruths']['classes']
labels['areas'] = labels['groundtruths']['areas']
labels['is_crowds'] = labels['groundtruths']['is_crowds']
return labels, outputs
def eval_metrics(self):
return eval_factory.evaluator_generator(self._params.eval)
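# --- Illustrative usage sketch (not part of the original file) ---
# A minimal sketch of the intended call order, assuming `params` is the
# detection config object whose fields are read above (architecture,
# retinanet_loss, retinanet_parser, train, postprocess, eval). It documents the
# dependency asserted in build_loss_fn(): build_model() must run first.
def _example_retinanet_training_setup(params):
  model = RetinanetModel(params)
  keras_model = model.build_model(params, mode=mode_keys.TRAIN)
  loss_fn = model.build_loss_fn()  # Valid only after build_model().
  # loss_fn(labels, outputs) returns a dict with 'total_loss', 'cls_loss', etc.
  return keras_model, loss_fn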
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model definition for the ShapeMask Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.dataloader import anchor
from official.legacy.detection.dataloader import mode_keys
from official.legacy.detection.evaluation import factory as eval_factory
from official.legacy.detection.modeling import base_model
from official.legacy.detection.modeling import losses
from official.legacy.detection.modeling.architecture import factory
from official.legacy.detection.ops import postprocess_ops
from official.legacy.detection.utils import box_utils
class ShapeMaskModel(base_model.Model):
"""ShapeMask model function."""
def __init__(self, params):
super(ShapeMaskModel, self).__init__(params)
self._params = params
self._keras_model = None
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._retinanet_head_fn = factory.retinanet_head_generator(params)
self._shape_prior_head_fn = factory.shapeprior_head_generator(params)
self._coarse_mask_fn = factory.coarsemask_head_generator(params)
self._fine_mask_fn = factory.finemask_head_generator(params)
# Loss functions.
self._cls_loss_fn = losses.RetinanetClassLoss(
params.retinanet_loss, params.architecture.num_classes)
self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
self._box_loss_weight = params.retinanet_loss.box_loss_weight
# Mask loss function.
self._shapemask_prior_loss_fn = losses.ShapemaskMseLoss()
self._shapemask_loss_fn = losses.ShapemaskLoss()
self._shape_prior_loss_weight = (
params.shapemask_loss.shape_prior_loss_weight)
self._coarse_mask_loss_weight = (
params.shapemask_loss.coarse_mask_loss_weight)
self._fine_mask_loss_weight = (params.shapemask_loss.fine_mask_loss_weight)
# Predict function.
self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
params.architecture.min_level, params.architecture.max_level,
params.postprocess)
def build_outputs(self, inputs, mode):
is_training = mode == mode_keys.TRAIN
images = inputs['image']
if 'anchor_boxes' in inputs:
anchor_boxes = inputs['anchor_boxes']
else:
anchor_boxes = anchor.Anchor(
self._params.architecture.min_level,
self._params.architecture.max_level, self._params.anchor.num_scales,
self._params.anchor.aspect_ratios, self._params.anchor.anchor_size,
images.get_shape().as_list()[1:3]).multilevel_boxes
batch_size = tf.shape(images)[0]
for level in anchor_boxes:
anchor_boxes[level] = tf.tile(
tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1, 1])
backbone_features = self._backbone_fn(images, is_training=is_training)
fpn_features = self._fpn_fn(backbone_features, is_training=is_training)
cls_outputs, box_outputs = self._retinanet_head_fn(
fpn_features, is_training=is_training)
valid_boxes, valid_scores, valid_classes, valid_detections = (
self._generate_detections_fn(box_outputs, cls_outputs, anchor_boxes,
inputs['image_info'][:, 1:2, :]))
image_size = images.get_shape().as_list()[1:3]
valid_outer_boxes = box_utils.compute_outer_boxes(
tf.reshape(valid_boxes, [-1, 4]),
image_size,
scale=self._params.shapemask_parser.outer_box_scale)
valid_outer_boxes = tf.reshape(valid_outer_boxes, tf.shape(valid_boxes))
# Wrapping if else code paths into a layer to make the checkpoint loadable
# in prediction mode.
class SampledBoxesLayer(tf.keras.layers.Layer):
"""Selects groundtruth boxes during training and detected boxes otherwise."""
def call(self, inputs, val_boxes, val_classes, val_outer_boxes, training):
if training:
boxes = inputs['mask_boxes']
outer_boxes = inputs['mask_outer_boxes']
classes = inputs['mask_classes']
else:
boxes = val_boxes
classes = val_classes
outer_boxes = val_outer_boxes
return boxes, classes, outer_boxes
boxes, classes, outer_boxes = SampledBoxesLayer()(
inputs,
valid_boxes,
valid_classes,
valid_outer_boxes,
training=is_training)
instance_features, prior_masks = self._shape_prior_head_fn(
fpn_features, boxes, outer_boxes, classes, is_training)
coarse_mask_logits = self._coarse_mask_fn(instance_features, prior_masks,
classes, is_training)
fine_mask_logits = self._fine_mask_fn(instance_features, coarse_mask_logits,
classes, is_training)
model_outputs = {
'cls_outputs': cls_outputs,
'box_outputs': box_outputs,
'fine_mask_logits': fine_mask_logits,
'coarse_mask_logits': coarse_mask_logits,
'prior_masks': prior_masks,
}
if not is_training:
model_outputs.update({
'num_detections': valid_detections,
'detection_boxes': valid_boxes,
'detection_outer_boxes': valid_outer_boxes,
'detection_masks': fine_mask_logits,
'detection_classes': valid_classes,
'detection_scores': valid_scores,
})
return model_outputs
def build_loss_fn(self):
if self._keras_model is None:
raise ValueError('build_loss_fn() must be called after build_model().')
filter_fn = self.make_filter_trainable_variables_fn()
trainable_variables = filter_fn(self._keras_model.trainable_variables)
def _total_loss_fn(labels, outputs):
cls_loss = self._cls_loss_fn(outputs['cls_outputs'],
labels['cls_targets'],
labels['num_positives'])
box_loss = self._box_loss_fn(outputs['box_outputs'],
labels['box_targets'],
labels['num_positives'])
# Adds Shapemask model losses.
shape_prior_loss = self._shapemask_prior_loss_fn(outputs['prior_masks'],
labels['mask_targets'],
labels['mask_is_valid'])
coarse_mask_loss = self._shapemask_loss_fn(outputs['coarse_mask_logits'],
labels['mask_targets'],
labels['mask_is_valid'])
fine_mask_loss = self._shapemask_loss_fn(outputs['fine_mask_logits'],
labels['fine_mask_targets'],
labels['mask_is_valid'])
model_loss = (
cls_loss + self._box_loss_weight * box_loss +
shape_prior_loss * self._shape_prior_loss_weight +
coarse_mask_loss * self._coarse_mask_loss_weight +
fine_mask_loss * self._fine_mask_loss_weight)
l2_regularization_loss = self.weight_decay_loss(trainable_variables)
total_loss = model_loss + l2_regularization_loss
shapemask_losses = {
'total_loss': total_loss,
'loss': total_loss,
'retinanet_cls_loss': cls_loss,
'l2_regularization_loss': l2_regularization_loss,
'retinanet_box_loss': box_loss,
'shapemask_prior_loss': shape_prior_loss,
'shapemask_coarse_mask_loss': coarse_mask_loss,
'shapemask_fine_mask_loss': fine_mask_loss,
'model_loss': model_loss,
}
return shapemask_losses
return _total_loss_fn
def build_input_layers(self, params, mode):
is_training = mode == mode_keys.TRAIN
input_shape = (
params.shapemask_parser.output_size +
[params.shapemask_parser.num_channels])
if is_training:
batch_size = params.train.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2], batch_size=batch_size, name='image_info'),
'mask_classes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks],
batch_size=batch_size,
name='mask_classes',
dtype=tf.int64),
'mask_outer_boxes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks, 4],
batch_size=batch_size,
name='mask_outer_boxes',
dtype=tf.float32),
'mask_boxes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks, 4],
batch_size=batch_size,
name='mask_boxes',
dtype=tf.float32),
}
else:
batch_size = params.eval.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2], batch_size=batch_size, name='image_info'),
}
return input_layer
def build_model(self, params, mode):
if self._keras_model is None:
input_layers = self.build_input_layers(self._params, mode)
outputs = self.model_outputs(input_layers, mode)
model = tf.keras.models.Model(
inputs=input_layers, outputs=outputs, name='shapemask')
assert model is not None, 'Failed to build tf.keras.Model.'
model.optimizer = self.build_optimizer()
self._keras_model = model
return self._keras_model
def post_processing(self, labels, outputs):
required_output_fields = [
'num_detections', 'detection_boxes', 'detection_classes',
'detection_masks', 'detection_scores'
]
for field in required_output_fields:
if field not in outputs:
raise ValueError(
'"{}" is missing in outputs, required {} found {}'.format(
field, required_output_fields, outputs.keys()))
required_label_fields = ['image_info']
for field in required_label_fields:
if field not in labels:
raise ValueError(
'"{}" is missing in labels, required {} found {}'.format(
field, required_label_fields, labels.keys()))
predictions = {
'image_info': labels['image_info'],
'num_detections': outputs['num_detections'],
'detection_boxes': outputs['detection_boxes'],
'detection_outer_boxes': outputs['detection_outer_boxes'],
'detection_classes': outputs['detection_classes'],
'detection_scores': outputs['detection_scores'],
'detection_masks': outputs['detection_masks'],
}
if 'groundtruths' in labels:
predictions['source_id'] = labels['groundtruths']['source_id']
labels = labels['groundtruths']
return labels, predictions
def eval_metrics(self):
return eval_factory.evaluator_generator(self._params.eval)
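# --- Illustrative usage sketch (not part of the original file) ---
# ShapeMask builds different Keras input layers for training and evaluation:
# training mode additionally feeds the sampled mask boxes/classes, while eval
# mode only feeds the image and image_info (see build_input_layers above).
# `params` is assumed to be the detection config object consumed by this class.
def _example_shapemask_input_signatures(params):
  model = ShapeMaskModel(params)
  train_inputs = model.build_input_layers(params, mode_keys.TRAIN)
  eval_inputs = model.build_input_layers(params, mode_keys.EVAL)
  # train_inputs has keys: image, image_info, mask_classes, mask_outer_boxes,
  # mask_boxes; eval_inputs only has: image, image_info.
  return sorted(train_inputs.keys()), sorted(eval_inputs.keys())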
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow implementation of non max suppression."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.utils import box_utils
NMS_TILE_SIZE = 512
def _self_suppression(iou, _, iou_sum):
batch_size = tf.shape(iou)[0]
can_suppress_others = tf.cast(
tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
iou_suppressed = tf.reshape(
tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
[batch_size, -1, 1]) * iou
iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
return [
iou_suppressed,
tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
]
def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
batch_size = tf.shape(boxes)[0]
new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
iou = box_utils.bbox_overlap(new_slice, box_slice)
ret_slice = tf.expand_dims(
tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
2) * box_slice
return boxes, ret_slice, iou_threshold, inner_idx + 1
def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
"""Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).
Args:
boxes: a tensor with a shape of [batch_size, anchors, 4].
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
output_size: an int32 tensor of size [batch_size]. Representing the number
of selected boxes for each batch.
idx: an integer scalar representing induction variable.
Returns:
boxes: updated boxes.
iou_threshold: pass down iou_threshold to the next iteration.
output_size: the updated output_size.
idx: the updated induction variable.
"""
boxes_shape = tf.shape(boxes)
num_tiles = boxes_shape[1] // NMS_TILE_SIZE
batch_size = boxes_shape[0]
# Iterates over tiles that can possibly suppress the current tile.
box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
_, box_slice, _, _ = tf.while_loop(
lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
_cross_suppression, [boxes, box_slice, iou_threshold,
tf.constant(0)])
# Iterates over the current tile to compute self-suppression.
iou = box_utils.bbox_overlap(box_slice, box_slice)
mask = tf.expand_dims(
tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
suppressed_iou, _, _ = tf.while_loop(
lambda _iou, loop_condition, _iou_sum: loop_condition, _self_suppression,
[iou, tf.constant(True),
tf.reduce_sum(iou, [1, 2])])
suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)
# Uses box_slice to update the input boxes.
mask = tf.reshape(
tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
boxes = tf.tile(tf.expand_dims(
box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape(
boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask)
boxes = tf.reshape(boxes, boxes_shape)
# Updates output_size.
output_size += tf.reduce_sum(
tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
return boxes, iou_threshold, output_size, idx + 1
def sorted_non_max_suppression_padded(scores, boxes, max_output_size,
iou_threshold):
"""A wrapper that handles non-maximum suppression.
Assumption:
* The boxes are sorted by scores unless the box is a dot (all coordinates
are zero).
* Boxes with higher scores can be used to suppress boxes with lower scores.
The overall design of the algorithm is to handle boxes tile-by-tile:
boxes = boxes.pad_to_multiple_of(tile_size)
num_tiles = len(boxes) // tile_size
output_boxes = []
for i in range(num_tiles):
box_tile = boxes[i*tile_size : (i+1)*tile_size]
for j in range(i):
suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
iou = bbox_overlap(box_tile, suppressing_tile)
# if the box is suppressed in iou, clear it to a dot
box_tile *= _update_boxes(iou)
# Iteratively handle the diagonal tile.
iou = bbox_overlap(box_tile, box_tile)
iou_changed = True
while iou_changed:
# boxes that are not suppressed by anything else
suppressing_boxes = _get_suppressing_boxes(iou)
# boxes that are suppressed by suppressing_boxes
suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
# clear iou to 0 for boxes that are suppressed, as they cannot be used
# to suppress other boxes any more
new_iou = _clear_iou(iou, suppressed_boxes)
iou_changed = (new_iou != iou)
iou = new_iou
# remaining boxes that can still suppress others, are selected boxes.
output_boxes.append(_get_suppressing_boxes(iou))
if len(output_boxes) >= max_output_size:
break
Args:
scores: a tensor with a shape of [batch_size, anchors].
boxes: a tensor with a shape of [batch_size, anchors, 4].
max_output_size: a scalar integer `Tensor` representing the maximum number
of boxes to be selected by non max suppression.
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
Returns:
nms_scores: a tensor with a shape of [batch_size, anchors]. It has same
dtype as input scores.
nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has
same dtype as input boxes.
"""
batch_size = tf.shape(boxes)[0]
num_boxes = tf.shape(boxes)[1]
pad = tf.cast(
tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
tf.int32) * NMS_TILE_SIZE - num_boxes
boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
scores = tf.pad(
tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
num_boxes += pad
def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
return tf.logical_and(
tf.reduce_min(output_size) < max_output_size,
idx < num_boxes // NMS_TILE_SIZE)
selected_boxes, _, output_size, _ = tf.while_loop(
_loop_cond, _suppression_loop_body,
[boxes, iou_threshold,
tf.zeros([batch_size], tf.int32),
tf.constant(0)])
idx = num_boxes - tf.cast(
tf.nn.top_k(
tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
tf.int32)
idx = tf.minimum(idx, num_boxes - 1)
idx = tf.reshape(idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]),
[-1])
boxes = tf.reshape(
tf.gather(tf.reshape(boxes, [-1, 4]), idx),
[batch_size, max_output_size, 4])
boxes = boxes * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
output_size, [-1, 1, 1]), boxes.dtype)
scores = tf.reshape(
tf.gather(tf.reshape(scores, [-1, 1]), idx),
[batch_size, max_output_size])
scores = scores * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
output_size, [-1, 1]), scores.dtype)
return scores, boxes
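# --- Illustrative usage sketch (not part of the original file) ---
# A tiny sanity check of sorted_non_max_suppression_padded: three boxes sorted
# by score, where the second box heavily overlaps the first and should be
# suppressed at iou_threshold=0.5. Values are assumptions chosen for the sketch.
def _example_sorted_nms():
  scores = tf.constant([[0.9, 0.8, 0.7]])
  boxes = tf.constant([[[0.0, 0.0, 10.0, 10.0],
                        [0.0, 0.0, 9.0, 9.0],
                        [20.0, 20.0, 30.0, 30.0]]])
  # Returns padded, score-sorted outputs of shape [1, 2] and [1, 2, 4]; the
  # surviving boxes are the first and third input boxes.
  return sorted_non_max_suppression_padded(
      scores, boxes, max_output_size=2, iou_threshold=0.5)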
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Post-processing model outputs to generate detection."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from official.legacy.detection.ops import nms
from official.legacy.detection.utils import box_utils
def generate_detections_factory(params):
"""Factory to select function to generate detection."""
if params.use_batched_nms:
func = functools.partial(
_generate_detections_batched,
max_total_size=params.max_total_size,
nms_iou_threshold=params.nms_iou_threshold,
score_threshold=params.score_threshold)
else:
func = functools.partial(
_generate_detections,
max_total_size=params.max_total_size,
nms_iou_threshold=params.nms_iou_threshold,
score_threshold=params.score_threshold,
pre_nms_num_boxes=params.pre_nms_num_boxes)
return func
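# --- Illustrative usage sketch (not part of the original file) ---
# The factory only reads five attributes from `params`, so for experimentation
# a plain namespace can stand in for the `postprocess` config section. The
# values below are assumptions for the sketch, not defaults of this library.
def _example_build_detection_fn():
  import types
  postprocess_params = types.SimpleNamespace(
      use_batched_nms=False,
      max_total_size=100,
      nms_iou_threshold=0.5,
      score_threshold=0.05,
      pre_nms_num_boxes=5000)
  # Returns a functools.partial wrapping _generate_detections with these values
  # bound; calling it requires only (boxes, scores).
  return generate_detections_factory(postprocess_params)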
def _select_top_k_scores(scores_in, pre_nms_num_detections):
"""Select top_k scores and indices for each class.
Args:
scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
class logit outputs on all feature levels. The N is the number of total
anchors on all levels. The num_classes is the number of classes predicted
by the model.
pre_nms_num_detections: Number of candidates before NMS.
Returns:
scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
num_classes].
"""
batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
scores_trans = tf.reshape(scores_trans, [-1, num_anchors])
top_k_scores, top_k_indices = tf.nn.top_k(
scores_trans, k=pre_nms_num_detections, sorted=True)
top_k_scores = tf.reshape(top_k_scores,
[batch_size, num_class, pre_nms_num_detections])
top_k_indices = tf.reshape(top_k_indices,
[batch_size, num_class, pre_nms_num_detections])
return tf.transpose(top_k_scores,
[0, 2, 1]), tf.transpose(top_k_indices, [0, 2, 1])
def _generate_detections(boxes,
scores,
max_total_size=100,
nms_iou_threshold=0.3,
score_threshold=0.05,
pre_nms_num_boxes=5000):
"""Generate the final detections given the model outputs.
This uses class unrolling with while-loop-based NMS, which can be parallelized
across the batch dimension.
Args:
boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
N, 1, 4], which stacks box predictions from all feature levels. The N is the
number of total anchors on all levels.
scores: a tensor with shape [batch_size, N, num_classes], which stacks class
probability on all feature levels. The N is the number of total anchors on
all levels. The num_classes is the number of classes predicted by the
model. Note that the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
pre_nms_num_boxes: an int number of top candidate detections per class
before NMS.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size] representing
classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
with tf.name_scope('generate_detections'):
nmsed_boxes = []
nmsed_classes = []
nmsed_scores = []
valid_detections = []
batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
_, total_anchors, num_classes = scores.get_shape().as_list()
# Selects top pre_nms_num scores and indices before NMS.
scores, indices = _select_top_k_scores(
scores, min(total_anchors, pre_nms_num_boxes))
for i in range(num_classes):
boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
scores_i = scores[:, :, i]
# Obtains pre_nms_num_boxes before running NMS.
boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)
# Filter out scores.
boxes_i, scores_i = box_utils.filter_boxes_by_scores(
boxes_i, scores_i, min_score_threshold=score_threshold)
(nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
tf.cast(scores_i, tf.float32),
tf.cast(boxes_i, tf.float32),
max_total_size,
iou_threshold=nms_iou_threshold)
nmsed_classes_i = tf.fill([batch_size, max_total_size], i)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
nmsed_scores = tf.concat(nmsed_scores, axis=1)
nmsed_classes = tf.concat(nmsed_classes, axis=1)
nmsed_scores, indices = tf.nn.top_k(
nmsed_scores, k=max_total_size, sorted=True)
nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
valid_detections = tf.reduce_sum(
input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32), axis=1)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
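# --- Illustrative usage sketch (not part of the original file) ---
# A small concrete call of _generate_detections with class-agnostic boxes
# ([batch, N, 1, 4]) and two foreground classes of already-activated scores.
# All tensor values are assumptions chosen for the sketch.
def _example_generate_detections():
  boxes = tf.constant([[[[0.0, 0.0, 10.0, 10.0]],
                        [[0.0, 0.0, 9.0, 9.0]],
                        [[20.0, 20.0, 30.0, 30.0]]]])  # [1, 3, 1, 4]
  scores = tf.constant([[[0.9, 0.1],
                         [0.8, 0.2],
                         [0.1, 0.7]]])  # [1, 3, 2]
  # Returns (boxes, scores, classes, valid_detections) padded to max_total_size.
  return _generate_detections(boxes, scores, max_total_size=5)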
def _generate_detections_per_image(boxes,
scores,
max_total_size=100,
nms_iou_threshold=0.3,
score_threshold=0.05,
pre_nms_num_boxes=5000):
"""Generate the final detections per image given the model outputs.
Args:
boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which stacks box
predictions from all feature levels. The N is the number of total anchors on
all levels.
scores: a tensor with shape [N, num_classes], which stacks class probability
on all feature levels. The N is the number of total anchors on all levels.
The num_classes is the number of classes predicted by the model. Note that
the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
pre_nms_num_boxes: an int number of top candidate detections per class
before NMS.
Returns:
nms_boxes: `float` Tensor of shape [max_total_size, 4] representing top
detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [max_total_size] representing sorted
confidence scores for detected boxes. The values are between [0, 1].
nms_classes: `int` Tensor of shape [max_total_size] representing classes for
detected boxes.
valid_detections: `int` Tensor of shape [1]. Only the top `valid_detections`
boxes are valid detections.
"""
nmsed_boxes = []
nmsed_scores = []
nmsed_classes = []
num_classes_for_box = boxes.get_shape().as_list()[1]
num_classes = scores.get_shape().as_list()[1]
for i in range(num_classes):
boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
scores_i = scores[:, i]
# Obtains pre_nms_num_boxes before running NMS.
scores_i, indices = tf.nn.top_k(
scores_i, k=tf.minimum(tf.shape(input=scores_i)[-1], pre_nms_num_boxes))
boxes_i = tf.gather(boxes_i, indices)
(nmsed_indices_i, nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
tf.cast(boxes_i, tf.float32),
tf.cast(scores_i, tf.float32),
max_total_size,
iou_threshold=nms_iou_threshold,
score_threshold=score_threshold,
pad_to_max_output_size=True,
name='nms_detections_' + str(i))
nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
# Sets scores of invalid boxes to -1.
nmsed_scores_i = tf.where(
tf.less(tf.range(max_total_size), [nmsed_num_valid_i]), nmsed_scores_i,
-tf.ones_like(nmsed_scores_i))
nmsed_classes_i = tf.fill([max_total_size], i)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
# Concats results from all classes and sort them.
nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
nmsed_scores = tf.concat(nmsed_scores, axis=0)
nmsed_classes = tf.concat(nmsed_classes, axis=0)
nmsed_scores, indices = tf.nn.top_k(
nmsed_scores, k=max_total_size, sorted=True)
nmsed_boxes = tf.gather(nmsed_boxes, indices)
nmsed_classes = tf.gather(nmsed_classes, indices)
valid_detections = tf.reduce_sum(
input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
def _generate_detections_batched(boxes, scores, max_total_size,
nms_iou_threshold, score_threshold):
"""Generates detected boxes with scores and classes for one-stage detector.
The function takes output of multi-level ConvNets and anchor boxes and
generates detected boxes. Note that this uses batched NMS, which is not
currently supported on TPU.
Args:
boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
N, 1, 4], which stacks box predictions from all feature levels. The N is the
number of total anchors on all levels.
scores: a tensor with shape [batch_size, N, num_classes], which stacks class
probability on all feature levels. The N is the number of total anchors on
all levels. The num_classes is the number of classes predicted by the
model. Note that the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size] representing
classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
with tf.name_scope('generate_detections'):
# TODO(tsungyi): Remove normalization/denormalization once the
# tf.image.combined_non_max_suppression is coordinate system agnostic.
# Normalizes maximum box coordinates to 1.
normalizer = tf.reduce_max(boxes)
boxes /= normalizer
(nmsed_boxes, nmsed_scores, nmsed_classes,
valid_detections) = tf.image.combined_non_max_suppression(
boxes,
scores,
max_output_size_per_class=max_total_size,
max_total_size=max_total_size,
iou_threshold=nms_iou_threshold,
score_threshold=score_threshold,
pad_per_class=False,
)
# De-normalizes box coordinates.
nmsed_boxes *= normalizer
nmsed_classes = tf.cast(nmsed_classes, tf.int32)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
class MultilevelDetectionGenerator(tf.keras.layers.Layer):
"""Generates detected boxes with scores and classes for one-stage detector."""
def __init__(self, min_level, max_level, params):
self._min_level = min_level
self._max_level = max_level
self._generate_detections = generate_detections_factory(params)
super(MultilevelDetectionGenerator, self).__init__(autocast=False)
def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
# Collects outputs from all levels into a list.
boxes = []
scores = []
for i in range(self._min_level, self._max_level + 1):
box_outputs_i_shape = tf.shape(box_outputs[i])
batch_size = box_outputs_i_shape[0]
num_anchors_per_locations = box_outputs_i_shape[-1] // 4
num_classes = tf.shape(class_outputs[i])[-1] // num_anchors_per_locations
# Applies score transformation and removes the implicit background class.
scores_i = tf.sigmoid(
tf.reshape(class_outputs[i], [batch_size, -1, num_classes]))
scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])
# Box decoding.
# The anchor boxes are shared for all data in a batch.
# One stage detector only supports class agnostic box regression.
anchor_boxes_i = tf.reshape(anchor_boxes[i], [batch_size, -1, 4])
box_outputs_i = tf.reshape(box_outputs[i], [batch_size, -1, 4])
boxes_i = box_utils.decode_boxes(box_outputs_i, anchor_boxes_i)
# Box clipping.
boxes_i = box_utils.clip_boxes(boxes_i, image_shape)
boxes.append(boxes_i)
scores.append(scores_i)
boxes = tf.concat(boxes, axis=1)
scores = tf.concat(scores, axis=1)
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(tf.expand_dims(boxes, axis=2), scores))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
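# --- Illustrative usage sketch (not part of the original file) ---
# A minimal single-level call of MultilevelDetectionGenerator: one image, a 2x2
# feature map at level 3, one anchor per location and two classes (implicit
# background + one foreground). `postprocess_params` is assumed to be the same
# namespace-style object accepted by generate_detections_factory above.
def _example_multilevel_detection(postprocess_params):
  generator = MultilevelDetectionGenerator(
      min_level=3, max_level=3, params=postprocess_params)
  box_outputs = {3: tf.zeros([1, 2, 2, 4])}    # num_anchors * 4 = 4
  class_outputs = {3: tf.zeros([1, 2, 2, 2])}  # num_anchors * num_classes = 2
  anchor_boxes = {3: tf.reshape(
      tf.tile(tf.constant([[0.0, 0.0, 8.0, 8.0]]), [4, 1]), [1, 2, 2, 4])}
  image_shape = tf.constant([[[32.0, 32.0]]])  # [batch, 1, 2]
  return generator(box_outputs, class_outputs, anchor_boxes, image_shape)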
class GenericDetectionGenerator(tf.keras.layers.Layer):
"""Generates the final detected boxes with scores and classes."""
def __init__(self, params):
super(GenericDetectionGenerator, self).__init__(autocast=False)
self._generate_detections = generate_detections_factory(params)
def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
"""Generate final detections.
Args:
box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
representing the class-specific box coordinates relative to anchors.
class_outputs: a tensor of shape of [batch_size, K, num_classes]
representing the class logits before applying score activation.
anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
corresponding anchor boxes w.r.t `box_outputs`.
image_shape: a tensor of shape of [batch_size, 2] storing the image height
and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size]
representing classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
class_outputs = tf.nn.softmax(class_outputs, axis=-1)
# Removes the background class.
class_outputs_shape = tf.shape(class_outputs)
batch_size = class_outputs_shape[0]
num_locations = class_outputs_shape[1]
num_classes = class_outputs_shape[-1]
num_detections = num_locations * (num_classes - 1)
class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
box_outputs = tf.reshape(
box_outputs,
tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = tf.tile(
tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
box_outputs = tf.reshape(box_outputs,
tf.stack([batch_size, num_detections, 4], axis=-1))
anchor_boxes = tf.reshape(
anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))
# Box decoding.
decoded_boxes = box_utils.decode_boxes(
box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])
# Box clipping
decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
decoded_boxes = tf.reshape(
decoded_boxes,
tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(decoded_boxes, class_outputs))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
class OlnDetectionGenerator(GenericDetectionGenerator):
"""Generates the final detected boxes with scores and classes."""
def __call__(self, box_outputs, class_outputs, anchor_boxes, image_shape,
is_single_fg_score=False, keep_nms=True):
"""Generate final detections for Object Localization Network (OLN).
Args:
box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
representing the class-specific box coordinates relative to anchors.
class_outputs: a tensor of shape of [batch_size, K, num_classes]
representing the class logits before applying score activation.
anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
corresponding anchor boxes w.r.t `box_outputs`.
image_shape: a tensor of shape of [batch_size, 2] storing the image height
and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
is_single_fg_score: a Bool indicator of whether class_outputs includes the
background scores concatenated or not. By default, class_outputs is a
concatenation of both scores for the foreground and background. That is,
is_single_fg_score=False.
keep_nms: a Bool indicator of whether to perform NMS or not.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size]
representing classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
if is_single_fg_score:
# Concatenates dummy background scores.
dummy_bg_scores = tf.zeros_like(class_outputs)
class_outputs = tf.stack([dummy_bg_scores, class_outputs], -1)
else:
class_outputs = tf.nn.softmax(class_outputs, axis=-1)
# Removes the background class.
class_outputs_shape = tf.shape(class_outputs)
batch_size = class_outputs_shape[0]
num_locations = class_outputs_shape[1]
num_classes = class_outputs_shape[-1]
num_detections = num_locations * (num_classes - 1)
class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
box_outputs = tf.reshape(
box_outputs,
tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = tf.tile(
tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
box_outputs = tf.reshape(box_outputs,
tf.stack([batch_size, num_detections, 4], axis=-1))
anchor_boxes = tf.reshape(
anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))
# Box decoding. For RPN outputs, box_outputs are all zeros.
decoded_boxes = box_utils.decode_boxes(
box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])
# Box clipping
decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
decoded_boxes = tf.reshape(
decoded_boxes,
tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))
if keep_nms:
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(decoded_boxes, class_outputs))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
else:
nmsed_boxes = decoded_boxes[:, :, 0, :]
nmsed_scores = class_outputs[:, :, 0]
nmsed_classes = tf.cast(tf.ones_like(nmsed_scores), tf.int32)
valid_detections = tf.cast(
tf.reduce_sum(tf.ones_like(nmsed_scores), axis=-1), tf.int32)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ROI-related ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.ops import nms
from official.legacy.detection.utils import box_utils
def multilevel_propose_rois(rpn_boxes,
rpn_scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=2000,
rpn_post_nms_top_k=1000,
rpn_nms_threshold=0.7,
rpn_score_threshold=0.0,
rpn_min_size_threshold=0.0,
decode_boxes=True,
clip_boxes=True,
use_batched_nms=False,
apply_sigmoid_to_score=True):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter small boxes and those that fall outside the image if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
rpn_boxes: a dict with keys representing FPN levels and values representing
box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
rpn_scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension are
[height, width] of the scaled image.
rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
keep before applying NMS. Default: 2000.
rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
keep after applying NMS. Default: 1000.
rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
rpn_score_threshold: a float between 0 and 1 representing the minimal box
score to keep before applying NMS. This is often used as a pre-filtering
step for better performance. If 0, no filtering is applied. Default: 0.
rpn_min_size_threshold: a float representing the minimal box size in each
side (w.r.t. the scaled image) to keep before applying NMS. This is often
used as a pre-filtering step for better performance. If 0, no filtering is
applied. Default: 0.
decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
`anchor_boxes`. Default: True.
clip_boxes: a boolean indicating whether boxes are first clipped to the
scaled image size before applying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default: True.
use_batched_nms: a boolean indicating whether NMS is applied in batch using
`tf.image.combined_non_max_suppression`. Currently only available in
CPU/GPU. Default: False.
apply_sigmoid_to_score: a boolean indicating whether to apply sigmoid to
`rpn_scores` before applying NMS. Default: True.
Returns:
selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k, 1],
representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(rpn_scores.keys()):
with tf.name_scope('level_%d' % level):
_, feature_h, feature_w, num_anchors_per_location = (
rpn_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
this_level_boxes = box_utils.decode_boxes(this_level_boxes,
this_level_anchors)
if clip_boxes:
this_level_boxes = box_utils.clip_boxes(this_level_boxes, image_shape)
if rpn_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_utils.filter_boxes(
this_level_boxes, this_level_scores, image_shape,
rpn_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
if rpn_nms_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold,
score_threshold=rpn_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if rpn_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_utils.filter_boxes_by_scores(this_level_boxes,
this_level_scores,
rpn_score_threshold))
this_level_boxes, this_level_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold))
else:
this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
selected_rois, selected_roi_scores = box_utils.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
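# --- Illustrative usage sketch (not part of the original file) ---
# A minimal single-level call of multilevel_propose_rois: one image, a 2x2 RPN
# feature map at level 2 with one anchor per location. All tensor values are
# assumptions chosen for the sketch; thresholds are left at their defaults.
def _example_propose_rois():
  rpn_boxes = {2: tf.zeros([1, 2, 2, 4])}
  rpn_scores = {2: tf.zeros([1, 2, 2, 1])}
  anchor_boxes = {2: tf.reshape(
      tf.tile(tf.constant([[0.0, 0.0, 8.0, 8.0]]), [4, 1]), [1, 2, 2, 4])}
  image_shape = tf.constant([[16.0, 16.0]])  # [batch, 2] = [[height, width]]
  # Returns the selected RoIs and their scores after per-level NMS and an
  # overall top-k across levels.
  return multilevel_propose_rois(
      rpn_boxes, rpn_scores, anchor_boxes, image_shape,
      rpn_pre_nms_top_k=4, rpn_post_nms_top_k=4)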
class ROIGenerator(tf.keras.layers.Layer):
"""Proposes RoIs for the second stage processing."""
def __init__(self, params):
self._rpn_pre_nms_top_k = params.rpn_pre_nms_top_k
self._rpn_post_nms_top_k = params.rpn_post_nms_top_k
self._rpn_nms_threshold = params.rpn_nms_threshold
self._rpn_score_threshold = params.rpn_score_threshold
self._rpn_min_size_threshold = params.rpn_min_size_threshold
self._test_rpn_pre_nms_top_k = params.test_rpn_pre_nms_top_k
self._test_rpn_post_nms_top_k = params.test_rpn_post_nms_top_k
self._test_rpn_nms_threshold = params.test_rpn_nms_threshold
self._test_rpn_score_threshold = params.test_rpn_score_threshold
self._test_rpn_min_size_threshold = params.test_rpn_min_size_threshold
self._use_batched_nms = params.use_batched_nms
super(ROIGenerator, self).__init__(autocast=False)
def call(self, boxes, scores, anchor_boxes, image_shape, is_training):
"""Generates RoI proposals.
Args:
boxes: a dict with keys representing FPN levels and values representing
box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
is_training: a bool indicating whether it is in training or inference
mode.
Returns:
proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the proposed RoIs w.r.t. the
scaled image.
proposed_roi_scores: a tensor of shape
[batch_size, rpn_post_nms_top_k, 1], representing the scores of the
proposed RoIs.
"""
proposed_rois, proposed_roi_scores = multilevel_propose_rois(
boxes,
scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
if is_training else self._test_rpn_pre_nms_top_k),
rpn_post_nms_top_k=(self._rpn_post_nms_top_k
if is_training else self._test_rpn_post_nms_top_k),
rpn_nms_threshold=(self._rpn_nms_threshold
if is_training else self._test_rpn_nms_threshold),
rpn_score_threshold=(self._rpn_score_threshold if is_training else
self._test_rpn_score_threshold),
rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
self._test_rpn_min_size_threshold),
decode_boxes=True,
clip_boxes=True,
use_batched_nms=self._use_batched_nms,
apply_sigmoid_to_score=True)
return proposed_rois, proposed_roi_scores
class OlnROIGenerator(ROIGenerator):
"""Proposes RoIs for the second stage processing."""
def __call__(self, boxes, scores, anchor_boxes, image_shape, is_training,
is_box_lrtb=False, object_scores=None):
"""Generates RoI proposals.
Args:
boxes: a dict with keys representing FPN levels and values representing
box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
is_training: a bool indicating whether it is in training or inference
mode.
is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
bottom) format.
object_scores: another objectness score (e.g., centerness). In OLN, we use
object_scores=centerness as a replacement of the scores at each level.
A dict with keys representing FPN levels and values representing logit
tensors of shape [batch_size, feature_h, feature_w, num_anchors].
Returns:
proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the proposed RoIs w.r.t. the
scaled image.
proposed_roi_scores: a tensor of shape
[batch_size, rpn_post_nms_top_k, 1], representing the scores of the
proposed RoIs.
"""
proposed_rois, proposed_roi_scores = self.oln_multilevel_propose_rois(
boxes,
scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
if is_training else self._test_rpn_pre_nms_top_k),
rpn_post_nms_top_k=(self._rpn_post_nms_top_k
if is_training else self._test_rpn_post_nms_top_k),
rpn_nms_threshold=(self._rpn_nms_threshold
if is_training else self._test_rpn_nms_threshold),
rpn_score_threshold=(self._rpn_score_threshold if is_training else
self._test_rpn_score_threshold),
rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
self._test_rpn_min_size_threshold),
decode_boxes=True,
clip_boxes=True,
use_batched_nms=self._use_batched_nms,
apply_sigmoid_to_score=True,
is_box_lrtb=is_box_lrtb,
rpn_object_scores=object_scores,)
return proposed_rois, proposed_roi_scores
def oln_multilevel_propose_rois(self,
rpn_boxes,
rpn_scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=2000,
rpn_post_nms_top_k=1000,
rpn_nms_threshold=0.7,
rpn_score_threshold=0.0,
rpn_min_size_threshold=0.0,
decode_boxes=True,
clip_boxes=True,
use_batched_nms=False,
apply_sigmoid_to_score=True,
is_box_lrtb=False,
rpn_object_scores=None,):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Adjust scores for each level if specified by rpn_object_scores.
b. Apply sigmoid transform if specified.
c. Decode boxes (either of xyhw or left-right-top-bottom format) if
specified.
d. Clip boxes if specified.
e. Filter small boxes and those that fall outside the image if specified.
f. Apply pre-NMS filtering including pre-NMS top k and score
thresholding.
g. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
rpn_boxes: a dict with keys representing FPN levels and values
representing box tensors of shape [batch_size, feature_h, feature_w,
num_anchors * 4].
rpn_scores: a dict with keys representing FPN levels and values
representing logit tensors of shape [batch_size, feature_h, feature_w,
num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
keep before applying NMS. Default: 2000.
rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
keep after applying NMS. Default: 1000.
rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
rpn_score_threshold: a float between 0 and 1 representing the minimal box
score to keep before applying NMS. This is often used as a pre-filtering
step for better performance. If 0, no filtering is applied. Default: 0.
rpn_min_size_threshold: a float representing the minimum box size on each
side (w.r.t. the scaled image) to keep before applying NMS. This is
often used as a pre-filtering step for better performance. If 0, no
filtering is applied. Default: 0.
decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
`anchor_boxes`. Default: True.
clip_boxes: a boolean indicating whether boxes are first clipped to the
scaled image size before applying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default: True.
use_batched_nms: a boolean indicating whether NMS is applied in batch
using `tf.image.combined_non_max_suppression`. Currently only available
in CPU/GPU. Default: False.
apply_sigmoid_to_score: a boolean indicating whether to apply sigmoid to
`rpn_scores` before applying NMS. Default: True.
is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
bottom) format.
rpn_object_scores: a predicted objectness score (e.g., centerness). In
OLN, we use object_scores=centerness as a replacement for the scores at
each level. A dict with keys representing FPN levels and values
representing logit tensors of shape [batch_size, feature_h, feature_w,
num_anchors].
Returns:
selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k,
1], representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(rpn_scores.keys()):
with tf.name_scope('level_%d' % level):
_, feature_h, feature_w, num_anchors_per_location = (
rpn_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if rpn_object_scores:
this_level_object_scores = rpn_object_scores[level]
this_level_object_scores = tf.reshape(this_level_object_scores,
[-1, num_boxes])
this_level_object_scores = tf.cast(this_level_object_scores,
this_level_scores.dtype)
this_level_scores = this_level_object_scores
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
if is_box_lrtb: # Box in left-right-top-bottom format.
this_level_boxes = box_utils.decode_boxes_lrtb(
this_level_boxes, this_level_anchors)
else: # Box in standard x-y-h-w format.
this_level_boxes = box_utils.decode_boxes(
this_level_boxes, this_level_anchors)
if clip_boxes:
this_level_boxes = box_utils.clip_boxes(
this_level_boxes, image_shape)
if rpn_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_utils.filter_boxes(
this_level_boxes, this_level_scores, image_shape,
rpn_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
if rpn_nms_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold,
score_threshold=rpn_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if rpn_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_utils.filter_boxes_by_scores(this_level_boxes,
this_level_scores,
rpn_score_threshold))
this_level_boxes, this_level_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores,
k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold))
else:
this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
selected_rois, selected_roi_scores = box_utils.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
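# Illustrative usage sketch (not part of the original file; names and shapes
# are assumptions): driving the OLN proposal routine above with per-level RPN
# outputs, where `proposal_layer` is an instance of the class defined above
# and `centerness` holds the per-level objectness replacement scores.
#
#   rpn_boxes = {2: ..., 3: ..., 4: ...}   # each [B, H_l, W_l, A * 4]
#   rpn_scores = {2: ..., 3: ..., 4: ...}  # each [B, H_l, W_l, A]
#   centerness = {2: ..., 3: ..., 4: ...}  # each [B, H_l, W_l, A]
#   rois, roi_scores = proposal_layer.oln_multilevel_propose_rois(
#       rpn_boxes, rpn_scores, anchor_boxes, image_shape,
#       rpn_pre_nms_top_k=2000, rpn_post_nms_top_k=1000,
#       rpn_nms_threshold=0.7, is_box_lrtb=True,
#       rpn_object_scores=centerness)
#   # rois: [B, rpn_post_nms_top_k, 4] w.r.t. the scaled image.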
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to performa spatial transformation for Tensor."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_EPSILON = 1e-8
def nearest_upsampling(data, scale):
"""Nearest neighbor upsampling implementation.
Args:
data: A tensor with a shape of [batch, height_in, width_in, channels].
scale: An integer multiple to scale resolution of input data.
Returns:
data_up: A tensor with a shape of
[batch, height_in*scale, width_in*scale, channels]. Same dtype as input
data.
"""
with tf.name_scope('nearest_upsampling'):
bs, _, _, c = data.get_shape().as_list()
shape = tf.shape(input=data)
h = shape[1]
w = shape[2]
bs = -1 if bs is None else bs
# Uses reshape to quickly upsample the input. The nearest pixel is selected
# implicitly via broadcasting.
data = tf.reshape(data, [bs, h, 1, w, 1, c]) * tf.ones(
[1, 1, scale, 1, scale, 1], dtype=data.dtype)
return tf.reshape(data, [bs, h * scale, w * scale, c])
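# Minimal worked example of the reshape-and-broadcast trick above
# (illustrative; assumes TF2 eager execution):
#
#   data = tf.constant([[[[1.], [2.]],
#                        [[3.], [4.]]]])      # shape [1, 2, 2, 1]
#   up = nearest_upsampling(data, scale=2)    # shape [1, 4, 4, 1]
#   # up[0, :, :, 0] ==
#   #   [[1., 1., 2., 2.],
#   #    [1., 1., 2., 2.],
#   #    [3., 3., 4., 4.],
#   #    [3., 3., 4., 4.]]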
def feature_bilinear_interpolation(features, kernel_y, kernel_x):
"""Feature bilinear interpolation.
The RoIAlign feature f can be computed by bilinear interpolation
of four neighboring feature points f0, f1, f2, and f3.
f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
[f10, f11]]
f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
kernel_y = [hy, ly]
kernel_x = [hx, lx]
Args:
features: The features are in shape of [batch_size, num_boxes, output_size *
2, output_size * 2, num_filters].
kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
(batch_size, num_boxes, output_size, _,
num_filters) = features.get_shape().as_list()
output_size = output_size // 2
kernel_y = tf.reshape(kernel_y, [batch_size, num_boxes, output_size * 2, 1])
kernel_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, output_size * 2])
# Use implicit broadcast to generate the interpolation kernel. The
# multiplier `4` is for avg pooling.
interpolation_kernel = kernel_y * kernel_x * 4
# Interpolate the gathered features with computed interpolation kernels.
features *= tf.cast(
tf.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype)
features = tf.reshape(
features,
[batch_size * num_boxes, output_size * 2, output_size * 2, num_filters])
features = tf.nn.avg_pool(features, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
features = tf.reshape(
features, [batch_size, num_boxes, output_size, output_size, num_filters])
return features
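# Worked example of the interpolation kernel above (illustrative): for a
# single sample point with ly = 0.25 and lx = 0.75 (so hy = 0.75, hx = 0.25),
# the four weights are
#   w00 = hy*hx = 0.1875, w01 = hy*lx = 0.5625,
#   w10 = ly*hx = 0.0625, w11 = ly*lx = 0.1875,
# which sum to 1. The factor of 4 applied to the kernel exactly cancels the
# 1/4 averaging of the subsequent 2x2 avg pool.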
def compute_grid_positions(boxes, boundaries, output_size, sample_offset):
"""Compute the grid position w.r.t.
the corresponding feature map.
Args:
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
the boundary (in (y, x)) of the corresponding feature map for each box.
Any resampled grid points that go beyond the boundary will be clipped.
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicating the subpixel sample offset
from the grid point.
Returns:
kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
"""
batch_size, num_boxes, _ = boxes.get_shape().as_list()
box_grid_x = []
box_grid_y = []
for i in range(output_size):
box_grid_x.append(boxes[:, :, 1] +
(i + sample_offset) * boxes[:, :, 3] / output_size)
box_grid_y.append(boxes[:, :, 0] +
(i + sample_offset) * boxes[:, :, 2] / output_size)
box_grid_x = tf.stack(box_grid_x, axis=2)
box_grid_y = tf.stack(box_grid_y, axis=2)
box_grid_y0 = tf.floor(box_grid_y)
box_grid_x0 = tf.floor(box_grid_x)
box_grid_x0 = tf.maximum(0., box_grid_x0)
box_grid_y0 = tf.maximum(0., box_grid_y0)
box_grid_x0 = tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1))
box_grid_x1 = tf.minimum(box_grid_x0 + 1,
tf.expand_dims(boundaries[:, :, 1], -1))
box_grid_y0 = tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1))
box_grid_y1 = tf.minimum(box_grid_y0 + 1,
tf.expand_dims(boundaries[:, :, 0], -1))
box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1)
box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1)
# The RoIAlign feature f can be computed by bilinear interpolation of four
# neighboring feature points f0, f1, f2, and f3.
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
# [f10, f11]]
# f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
# f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
ly = box_grid_y - box_grid_y0
lx = box_grid_x - box_grid_x0
hy = 1.0 - ly
hx = 1.0 - lx
kernel_y = tf.reshape(
tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1])
kernel_x = tf.reshape(
tf.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1])
return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
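# Worked example (illustrative): for a box with top-left (y, x) = (10, 20),
# size (h, w) = (4, 8), output_size = 2 and sample_offset = 0.5, the sample
# centers are
#   box_grid_y = [10 + 0.5 * 4 / 2, 10 + 1.5 * 4 / 2] = [11, 13]
#   box_grid_x = [20 + 0.5 * 8 / 2, 20 + 1.5 * 8 / 2] = [22, 26]
# and each center contributes its floor / floor+1 neighbors (clipped to the
# feature boundary) together with the [hy, ly] and [hx, lx] kernels above.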
def get_grid_one_hot(box_gridy0y1, box_gridx0x1, feature_height, feature_width):
"""Get grid_one_hot from indices and feature_size."""
(batch_size, num_boxes, output_size, _) = box_gridx0x1.get_shape().as_list()
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size, 2]),
dtype=tf.int32)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size, 2]),
dtype=tf.int32)
# shape is [batch_size, num_boxes, output_size, 2, height]
grid_y_one_hot = tf.one_hot(tf.cast(y_indices, tf.int32), feature_height)
# shape is [batch_size, num_boxes, output_size, 2, width]
grid_x_one_hot = tf.one_hot(tf.cast(x_indices, tf.int32), feature_width)
return grid_y_one_hot, grid_x_one_hot
def selective_crop_and_resize(features,
boxes,
box_levels,
boundaries,
output_size=7,
sample_offset=0.5,
use_einsum_gather=False):
"""Crop and resize boxes on a set of feature maps.
Given multiple features maps indexed by different levels, and a set of boxes
where each box is mapped to a certain level, it selectively crops and resizes
boxes from the corresponding feature maps to generate the box features.
We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
figure 3 for reference). Specifically, for each feature map, we select an
(output_size, output_size) set of pixels corresponding to the box location,
and then use bilinear interpolation to select the feature value for each
pixel.
For performance, we perform the gather and interpolation on all layers as a
single operation. In this op the multi-level features are first stacked and
gathered into [2*output_size, 2*output_size] feature points. Then bilinear
interpolation is performed on the gathered feature points to generate
[output_size, output_size] RoIAlign feature map.
Here is the step-by-step algorithm:
1. The multi-level features are gathered into a
[batch_size, num_boxes, output_size*2, output_size*2, num_filters]
Tensor. The Tensor contains four neighboring feature points for each
vertex in the output grid.
2. Compute the interpolation kernel of shape
[batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
can be seen as stacking 2x2 interpolation kernels for all vertices in the
output grid.
3. Element-wise multiply the gathered features and interpolation kernel.
Then apply 2x2 average pooling to reduce spatial dimension to
output_size.
Args:
features: a 5-D tensor of shape [batch_size, num_levels, max_height,
max_width, num_filters] where cropping and resizing are based.
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
the 0-based corresponding feature level index of each box.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
the boundary (in (y, x)) of the corresponding feature map for each box.
Any resampled grid points that go beyond the boundary will be clipped.
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicating the subpixel sample offset
from the grid point.
use_einsum_gather: whether to use einsum to replace gather. Using einsum
can improve performance when the feature size is not large, and einsum is
also friendlier to model partitioning. Gather's performance is better when
the feature size is very large and there are multiple box levels.
Returns:
features_per_box: a 5-D tensor of shape
[batch_size, num_boxes, output_size, output_size, num_filters]
representing the cropped features.
"""
(batch_size, num_levels, max_feature_height, max_feature_width,
num_filters) = features.get_shape().as_list()
_, num_boxes, _ = boxes.get_shape().as_list()
kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
boxes, boundaries, output_size, sample_offset)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
if use_einsum_gather:
# Bilinear interpolation is done during the last two gathers:
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
# [f10, f11]]
# [[f00, f01],
# [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
# where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.
# shape is [batch_size, boxes, output_size, 2, 1]
grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(box_gridy0y1,
box_gridx0x1,
max_feature_height,
max_feature_width)
# shape is [batch_size, num_boxes, output_size, height]
grid_y_weight = tf.reduce_sum(
tf.multiply(grid_y_one_hot, kernel_y), axis=-2)
# shape is [batch_size, num_boxes, output_size, width]
grid_x_weight = tf.reduce_sum(
tf.multiply(grid_x_one_hot, kernel_x), axis=-2)
# Gather for y_axis.
# shape is [batch_size, num_boxes, output_size, width, features]
features_per_box = tf.einsum('bmhwf,bmoh->bmowf', features,
tf.cast(grid_y_weight, features.dtype))
# Gather for x_axis.
# shape is [batch_size, num_boxes, output_size, output_size, features]
features_per_box = tf.einsum('bmhwf,bmow->bmhof', features_per_box,
tf.cast(grid_x_weight, features.dtype))
else:
height_dim_offset = max_feature_width
level_dim_offset = max_feature_height * height_dim_offset
batch_dim_offset = num_levels * level_dim_offset
batch_size_offset = tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2])
box_levels_offset = tf.tile(
tf.reshape(box_levels * level_dim_offset,
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2])
y_indices_offset = tf.tile(
tf.reshape(y_indices * height_dim_offset,
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2])
x_indices_offset = tf.tile(
tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1])
indices = tf.reshape(
batch_size_offset + box_levels_offset + y_indices_offset +
x_indices_offset, [-1])
features = tf.reshape(features, [-1, num_filters])
# TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
# performance.
features_per_box = tf.reshape(
tf.gather(features, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
features_per_box = feature_bilinear_interpolation(features_per_box,
kernel_y, kernel_x)
return features_per_box
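# Illustrative usage sketch (shapes are assumptions): cropping 7x7 RoIAlign
# features from a pyramid that has already been stacked along a level axis.
#
#   features = tf.random.normal([2, 4, 160, 160, 256])  # [B, levels, H, W, C]
#   boxes = ...       # [2, 100, 4] in (y, x, h, w) feature-map pixels
#   box_levels = ...  # [2, 100, 1], 0-based level index of each box
#   boundaries = ...  # [2, 100, 2], (height - 1, width - 1) of each box's level
#   crops = selective_crop_and_resize(
#       features, boxes, box_levels, boundaries, output_size=7)
#   # crops: [2, 100, 7, 7, 256]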
def multilevel_crop_and_resize(features, boxes, output_size=7):
"""Crop and resize on multilevel feature pyramid.
Generate the (output_size, output_size) set of pixels for each input box
by first locating the box into the correct feature level, and then cropping
and resizing it using the corresponding feature map of that level.
Args:
features: A dictionary with key as pyramid level and value as features. The
features are in shape of [batch_size, height_l, width_l, num_filters].
boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
a box with [y1, x1, y2, x2] in un-normalized coordinates.
output_size: A scalar to indicate the output crop size.
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
with tf.name_scope('multilevel_crop_and_resize'):
levels = list(features.keys())
min_level = min(levels)
max_level = max(levels)
batch_size, max_feature_height, max_feature_width, num_filters = (
features[min_level].get_shape().as_list())
_, num_boxes, _ = boxes.get_shape().as_list()
# Stack feature pyramid into a features_all of shape
# [batch_size, levels, height, width, num_filters].
features_all = []
feature_heights = []
feature_widths = []
for level in range(min_level, max_level + 1):
shape = features[level].get_shape().as_list()
feature_heights.append(shape[1])
feature_widths.append(shape[2])
# Concat tensor of [batch_size, height_l * width_l, num_filters] for each
# levels.
features_all.append(
tf.reshape(features[level], [batch_size, -1, num_filters]))
features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])
# Calculate height_l * width_l for each level.
level_dim_sizes = [
feature_widths[i] * feature_heights[i]
for i in range(len(feature_widths))
]
# level_dim_offsets is accumulated sum of level_dim_size.
level_dim_offsets = [0]
for i in range(len(feature_widths) - 1):
level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i])
batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
height_dim_sizes = tf.constant(feature_widths, tf.int32)
# Assigns boxes to the right level.
box_width = boxes[:, :, 3] - boxes[:, :, 1]
box_height = boxes[:, :, 2] - boxes[:, :, 0]
areas_sqrt = tf.sqrt(box_height * box_width)
levels = tf.cast(
tf.math.floordiv(
tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0)) + 4.0,
dtype=tf.int32)
# Maps levels between [min_level, max_level].
levels = tf.minimum(max_level, tf.maximum(levels, min_level))
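# Worked example of the level assignment above (illustrative): a box whose
# sqrt(area) is 224 maps to level 4, a 112x112 box maps to level 3 and a
# 448x448 box maps to level 5, with the clamp above keeping the result
# within [min_level, max_level].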
# Projects box location and sizes to corresponding feature levels.
scale_to_level = tf.cast(
tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)),
dtype=boxes.dtype)
boxes /= tf.expand_dims(scale_to_level, axis=2)
box_width /= scale_to_level
box_height /= scale_to_level
boxes = tf.concat([
boxes[:, :, 0:2],
tf.expand_dims(box_height, -1),
tf.expand_dims(box_width, -1)
],
axis=-1)
# Maps levels to [0, max_level-min_level].
levels -= min_level
level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
boundary = tf.cast(
tf.concat([
tf.expand_dims(
[[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1,
axis=-1),
tf.expand_dims(
[[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1,
axis=-1),
],
axis=-1), boxes.dtype)
# Compute grid positions.
kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
boxes, boundary, output_size, sample_offset=0.5)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
batch_size_offset = tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2])
# Get level offset for each box. Each box belongs to one level.
levels_offset = tf.tile(
tf.reshape(
tf.gather(level_dim_offsets, levels),
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2])
y_indices_offset = tf.tile(
tf.reshape(
y_indices * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2])
x_indices_offset = tf.tile(
tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1])
indices = tf.reshape(
batch_size_offset + levels_offset + y_indices_offset + x_indices_offset,
[-1])
# TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
# performance.
features_per_box = tf.reshape(
tf.gather(features_r2, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
# Bilinear interpolation.
features_per_box = feature_bilinear_interpolation(features_per_box,
kernel_y, kernel_x)
return features_per_box
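# Illustrative usage sketch (shapes are assumptions): cropping RoI features
# from an FPN dict keyed by level.
#
#   fpn_features = {
#       level: tf.random.normal(
#           [2, 256 // 2**(level - 2), 256 // 2**(level - 2), 256])
#       for level in range(2, 6)
#   }
#   boxes = ...  # [2, 100, 4], un-normalized [y1, x1, y2, x2] image pixels
#   roi_features = multilevel_crop_and_resize(fpn_features, boxes,
#                                             output_size=7)
#   # roi_features: [2, 100, 7, 7, 256]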
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
min_mask_level, mask_crop_size):
"""Crop the FPN features at the appropriate levels for each detection.
Args:
features: a float tensor of shape [batch_size, num_levels, max_feature_size,
max_feature_size, num_downsample_channels].
level_boxes: a float Tensor of the level boxes to crop from. [batch_size,
num_instances, 4].
detection_prior_levels: an int Tensor of instance assigned level of shape
[batch_size, num_instances].
min_mask_level: minimum FPN level to crop mask feature from.
mask_crop_size: an int of mask crop size.
Returns:
crop_features: a float Tensor of shape [batch_size * num_instances,
mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
instance feature crop.
"""
(batch_size, num_levels, max_feature_size, _,
num_downsample_channels) = features.get_shape().as_list()
_, num_of_instances, _ = level_boxes.get_shape().as_list()
level_boxes = tf.cast(level_boxes, tf.int32)
assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]
x_start_indices = level_boxes[:, :, 1]
y_start_indices = level_boxes[:, :, 0]
# generate the full indices (not just the starting index)
x_idx_list = []
y_idx_list = []
for i in range(mask_crop_size):
x_idx_list.append(x_start_indices + i)
y_idx_list.append(y_start_indices + i)
x_indices = tf.stack(x_idx_list, axis=2)
y_indices = tf.stack(y_idx_list, axis=2)
levels = detection_prior_levels - min_mask_level
height_dim_size = max_feature_size
level_dim_size = max_feature_size * height_dim_size
batch_dim_size = num_levels * level_dim_size
# TODO(weicheng) change this to gather_nd for better readability.
indices = tf.reshape(
tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
[1, num_of_instances, mask_crop_size, mask_crop_size]) + tf.tile(
tf.reshape(levels * level_dim_size,
[batch_size, num_of_instances, 1, 1]),
[1, 1, mask_crop_size, mask_crop_size]) + tf.tile(
tf.reshape(y_indices * height_dim_size,
[batch_size, num_of_instances, mask_crop_size, 1]),
[1, 1, 1, mask_crop_size]) +
tf.tile(
tf.reshape(x_indices,
[batch_size, num_of_instances, 1, mask_crop_size]),
[1, 1, mask_crop_size, 1]), [-1])
features_r2 = tf.reshape(features, [-1, num_downsample_channels])
crop_features = tf.reshape(
tf.gather(features_r2, indices), [
batch_size * num_of_instances, mask_crop_size, mask_crop_size,
num_downsample_channels
])
return crop_features
def crop_mask_in_target_box(masks,
boxes,
target_boxes,
output_size,
sample_offset=0,
use_einsum=True):
"""Crop masks in target boxes.
Args:
masks: A tensor with a shape of [batch_size, num_masks, height, width].
boxes: a float tensor representing box coordinates that tightly enclose
masks with a shape of [batch_size, num_masks, 4] in un-normalized
coordinates. A box is represented by [ymin, xmin, ymax, xmax].
target_boxes: a float tensor representing target box coordinates for masks
with a shape of [batch_size, num_masks, 4] in un-normalized coordinates. A
box is represented by [ymin, xmin, ymax, xmax].
output_size: A scalar to indicate the output crop size. It currently only
supports square outputs.
sample_offset: a float number in [0, 1] indicating the subpixel sample offset
from the grid point.
use_einsum: Use einsum to replace gather in selective_crop_and_resize.
Returns:
A 4-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size].
"""
with tf.name_scope('crop_mask_in_target_box'):
batch_size, num_masks, height, width = masks.get_shape().as_list()
masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
# Pad zeros on the boundary of masks.
masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4)
masks = tf.reshape(masks, [batch_size, num_masks, height + 4, width + 4, 1])
# Projects target box locations and sizes to corresponding cropped
# mask coordinates.
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=target_boxes, num_or_size_splits=4, axis=2)
# x and w scale by the mask width; y and h scale by the mask height.
y_transform = (bb_y_min - gt_y_min) * height / (gt_y_max - gt_y_min +
_EPSILON) + 2
x_transform = (bb_x_min - gt_x_min) * width / (gt_x_max - gt_x_min +
_EPSILON) + 2
h_transform = (bb_y_max - bb_y_min) * height / (
gt_y_max - gt_y_min + _EPSILON)
w_transform = (bb_x_max - bb_x_min) * width / (
gt_x_max - gt_x_min + _EPSILON)
boundaries = tf.concat([
tf.cast(
tf.ones_like(y_transform) * ((height + 4) - 1), dtype=tf.float32),
tf.cast(
tf.ones_like(x_transform) * ((width + 4) - 1), dtype=tf.float32)
],
axis=-1)
# Reshape tensors to have the right shape for selective_crop_and_resize.
transformed_boxes = tf.concat(
[y_transform, x_transform, h_transform, w_transform], -1)
levels = tf.tile(
tf.reshape(tf.range(num_masks), [1, num_masks]), [batch_size, 1])
cropped_masks = selective_crop_and_resize(
masks,
transformed_boxes,
levels,
boundaries,
output_size,
sample_offset=sample_offset,
use_einsum_gather=use_einsum)
cropped_masks = tf.squeeze(cropped_masks, axis=-1)
return cropped_masks
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Target and sampling related ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.ops import spatial_transform_ops
from official.legacy.detection.utils import box_utils
from official.vision.utils.object_detection import balanced_positive_negative_sampler
def box_matching(boxes, gt_boxes, gt_classes):
"""Match boxes to groundtruth boxes.
Given the proposal boxes and the groundtruth boxes and classes, perform the
groundtruth matching by taking the argmax of the IoU between boxes and
groundtruth boxes.
Args:
boxes: a tensor of shape of [batch_size, N, 4] representing the box
coordinates to be matched to groundtruth boxes.
gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
the groundtruth box coordinates. It is padded with -1s to indicate the
invalid boxes.
gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
Returns:
matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing
the matched groundtruth box coordinates for each input box. If the box
does not overlap with any groundtruth boxes, the matched boxes of it
will be set to all 0s.
matched_gt_classes: a tensor of shape of [batch_size, N], representing
the matched groundtruth classes for each input box. If the box does not
overlap with any groundtruth boxes, the matched box classes of it will
be set to 0, which corresponds to the background class.
matched_gt_indices: a tensor of shape of [batch_size, N], representing
the indices of the matched groundtruth boxes in the original gt_boxes
tensor. If the box does not overlap with any groundtruth boxes, the
index of the matched groundtruth will be set to -1.
matched_iou: a tensor of shape of [batch_size, N], representing the IoU
between the box and its matched groundtruth box. The matched IoU is the
maximum IoU of the box and all the groundtruth boxes.
iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix
between boxes and the groundtruth boxes. The IoU between a box and the
invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1.
"""
# Compute IoU between boxes and gt_boxes.
# iou <- [batch_size, N, K]
iou = box_utils.bbox_overlap(boxes, gt_boxes)
# max_iou <- [batch_size, N]
# 0.0 -> no match to gt, or -1.0 match to no gt
matched_iou = tf.reduce_max(iou, axis=-1)
# background_box_mask <- bool, [batch_size, N]
background_box_mask = tf.less_equal(matched_iou, 0.0)
argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32)
argmax_iou_indices_shape = tf.shape(argmax_iou_indices)
batch_indices = (
tf.expand_dims(tf.range(argmax_iou_indices_shape[0]), axis=-1) *
tf.ones([1, argmax_iou_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, argmax_iou_indices], axis=-1)
matched_gt_boxes = tf.gather_nd(gt_boxes, gather_nd_indices)
matched_gt_boxes = tf.where(
tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]),
tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype),
matched_gt_boxes)
matched_gt_classes = tf.gather_nd(gt_classes, gather_nd_indices)
matched_gt_classes = tf.where(background_box_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(background_box_mask,
-tf.ones_like(argmax_iou_indices),
argmax_iou_indices)
return (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
iou)
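# Worked example (illustrative): with one image, two proposals and a single
# groundtruth box,
#   boxes      = [[[0, 0, 10, 10], [20, 20, 30, 30]]]
#   gt_boxes   = [[[0, 0, 10, 10]]]
#   gt_classes = [[3]]
# the first proposal matches the groundtruth (IoU 1.0, class 3, index 0),
# while the second has no overlap and is therefore assigned an all-zero
# matched box, the background class 0 and a matched index of -1.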
def assign_and_sample_proposals(proposed_boxes,
gt_boxes,
gt_classes,
num_samples_per_image=512,
mix_gt_boxes=True,
fg_fraction=0.25,
fg_iou_thresh=0.5,
bg_iou_thresh_hi=0.5,
bg_iou_thresh_lo=0.0):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled image.
This tensor might have padding of values -1 indicating the invalid box
coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
num_samples_per_image: an integer representing the RoI minibatch size per
image.
mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes before
sampling proposals.
fg_fraction: a float representing the target fraction of the RoI minibatch
that is labeled foreground (i.e., class > 0).
fg_iou_thresh: a float representing the IoU overlap threshold for an RoI to
be considered foreground (if >= fg_iou_thresh).
bg_iou_thresh_hi: a float representing the IoU overlap threshold for an RoI
to be considered background (class = 0 if overlap in [LO, HI)).
bg_iou_thresh_lo: a float representing the IoU overlap threshold for an RoI
to be considered background (class = 0 if overlap in [LO, HI)).
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
indices of the sampled groundtruth boxes in the original `gt_boxes`
tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
with tf.name_scope('sample_proposals'):
if mix_gt_boxes:
boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
else:
boxes = proposed_boxes
(matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
_) = box_matching(boxes, gt_boxes, gt_classes)
positive_match = tf.greater(matched_iou, fg_iou_thresh)
negative_match = tf.logical_and(
tf.greater_equal(matched_iou, bg_iou_thresh_lo),
tf.less(matched_iou, bg_iou_thresh_hi))
ignored_match = tf.less(matched_iou, 0.0)
# re-assign negatively matched boxes to the background class.
matched_gt_classes = tf.where(negative_match,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(negative_match,
tf.zeros_like(matched_gt_indices),
matched_gt_indices)
sample_candidates = tf.logical_and(
tf.logical_or(positive_match, negative_match),
tf.logical_not(ignored_match))
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=fg_fraction, is_static=True))
batch_size, _ = sample_candidates.get_shape().as_list()
sampled_indicators = []
for i in range(batch_size):
sampled_indicator = sampler.subsample(sample_candidates[i],
num_samples_per_image,
positive_match[i])
sampled_indicators.append(sampled_indicator)
sampled_indicators = tf.stack(sampled_indicators)
_, sampled_indices = tf.nn.top_k(
tf.cast(sampled_indicators, dtype=tf.int32),
k=num_samples_per_image,
sorted=True)
sampled_indices_shape = tf.shape(sampled_indices)
batch_indices = (
tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)
sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
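# Illustrative usage sketch (argument values are assumptions): sampling a
# 512-RoI minibatch with a 25% foreground target during training.
#
#   sampled_rois, gt_box_targets, class_targets, gt_indices = (
#       assign_and_sample_proposals(
#           proposed_boxes, gt_boxes, gt_classes,
#           num_samples_per_image=512, mix_gt_boxes=True,
#           fg_fraction=0.25, fg_iou_thresh=0.5,
#           bg_iou_thresh_hi=0.5, bg_iou_thresh_lo=0.0))
#   # sampled_rois: [batch_size, 512, 4]; class_targets == 0 marks background.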
def sample_and_crop_foreground_masks(candidate_rois,
candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices,
gt_masks,
num_mask_samples_per_image=128,
mask_target_size=28):
"""Samples and creates cropped foreground masks for training.
Args:
candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
number of candidate RoIs to be considered for mask sampling. It includes
both positive and negative RoIs. The `num_mask_samples_per_image` positive
RoIs will be sampled to create mask training targets.
candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: a tensor of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i], where
gt_boxes, of shape [batch_size, MAX_INSTANCES, 4] with MAX_INSTANCES >= N,
is the superset of candidate_gt_boxes.
gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
containing all the groundtruth masks which sample masks are drawn from.
num_mask_samples_per_image: an integer which specifies the number of masks
to sample.
mask_target_size: an integer which specifies the final cropped mask size
after sampling. The output masks are resized w.r.t the sampled RoIs.
Returns:
foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: a tensor of shape of [batch_size, K] storing the classes
corresponding to the sampled foreground masks.
cropped_foreground_masks: a tensor of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the cropped
foreground masks used for training.
"""
with tf.name_scope('sample_and_crop_foreground_masks'):
_, fg_instance_indices = tf.nn.top_k(
tf.cast(tf.greater(candidate_gt_classes, 0), dtype=tf.int32),
k=num_mask_samples_per_image)
fg_instance_indices_shape = tf.shape(fg_instance_indices)
batch_indices = (
tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) *
tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32))
gather_nd_instance_indices = tf.stack([batch_indices, fg_instance_indices],
axis=-1)
foreground_rois = tf.gather_nd(candidate_rois, gather_nd_instance_indices)
foreground_boxes = tf.gather_nd(candidate_gt_boxes,
gather_nd_instance_indices)
foreground_classes = tf.gather_nd(candidate_gt_classes,
gather_nd_instance_indices)
foreground_gt_indices = tf.gather_nd(candidate_gt_indices,
gather_nd_instance_indices)
foreground_gt_indices_shape = tf.shape(foreground_gt_indices)
batch_indices = (
tf.expand_dims(tf.range(foreground_gt_indices_shape[0]), axis=-1) *
tf.ones([1, foreground_gt_indices_shape[-1]], dtype=tf.int32))
gather_nd_gt_indices = tf.stack([batch_indices, foreground_gt_indices],
axis=-1)
foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices)
cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
foreground_masks,
foreground_boxes,
foreground_rois,
mask_target_size,
sample_offset=0.5)
return foreground_rois, foreground_classes, cropped_foreground_masks
class ROISampler(tf.keras.layers.Layer):
"""Samples RoIs and creates training targets."""
def __init__(self, params):
self._num_samples_per_image = params.num_samples_per_image
self._fg_fraction = params.fg_fraction
self._fg_iou_thresh = params.fg_iou_thresh
self._bg_iou_thresh_hi = params.bg_iou_thresh_hi
self._bg_iou_thresh_lo = params.bg_iou_thresh_lo
self._mix_gt_boxes = params.mix_gt_boxes
super(ROISampler, self).__init__(autocast=False)
def call(self, rois, gt_boxes, gt_classes):
"""Sample and assign RoIs for training.
Args:
rois: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
"""
sampled_rois, sampled_gt_boxes, sampled_gt_classes, sampled_gt_indices = (
assign_and_sample_proposals(
rois,
gt_boxes,
gt_classes,
num_samples_per_image=self._num_samples_per_image,
mix_gt_boxes=self._mix_gt_boxes,
fg_fraction=self._fg_fraction,
fg_iou_thresh=self._fg_iou_thresh,
bg_iou_thresh_hi=self._bg_iou_thresh_hi,
bg_iou_thresh_lo=self._bg_iou_thresh_lo))
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
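# Illustrative usage sketch (`params` is a hypothetical config object carrying
# the attributes read in __init__): the layer is a thin wrapper around
# assign_and_sample_proposals.
#
#   sampler = ROISampler(params)
#   sampled_rois, gt_box_targets, class_targets, gt_indices = sampler(
#       rois, gt_boxes, gt_classes)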
class ROIScoreSampler(ROISampler):
"""Samples RoIs, RoI-scores and creates training targets."""
def __call__(self, rois, roi_scores, gt_boxes, gt_classes):
"""Sample and assign RoIs for training.
Args:
rois: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
roi_scores: a tensor of shape of [batch_size, N], storing the RPN scores
of the proposals, which can be either classification or centerness
scores.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_roi_scores: a tensor of shape of [batch_size, K], representing the
confidence scores of the sampled RoIs.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
"""
(sampled_rois, sampled_roi_scores, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices) = (
self.assign_and_sample_proposals_and_scores(
rois,
roi_scores,
gt_boxes,
gt_classes,
num_samples_per_image=self._num_samples_per_image,
mix_gt_boxes=self._mix_gt_boxes,
fg_fraction=self._fg_fraction,
fg_iou_thresh=self._fg_iou_thresh,
bg_iou_thresh_hi=self._bg_iou_thresh_hi,
bg_iou_thresh_lo=self._bg_iou_thresh_lo))
return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
sampled_gt_classes, sampled_gt_indices)
def assign_and_sample_proposals_and_scores(self,
proposed_boxes,
proposed_scores,
gt_boxes,
gt_classes,
num_samples_per_image=512,
mix_gt_boxes=True,
fg_fraction=0.25,
fg_iou_thresh=0.5,
bg_iou_thresh_hi=0.5,
bg_iou_thresh_lo=0.0):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number
of proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
proposed_scores: a tensor of shape of [batch_size, N]. N is the number of
proposals before groundtruth assignment. It is the rpn scores for all
proposed boxes which can be either their classification or centerness
scores.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
num_samples_per_image: an integer representing the RoI minibatch size per
image.
mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes
before sampling proposals.
fg_fraction: a float representing the target fraction of the RoI minibatch
that is labeled foreground (i.e., class > 0).
fg_iou_thresh: a float representing the IoU overlap threshold for an RoI
to be considered foreground (if >= fg_iou_thresh).
bg_iou_thresh_hi: a float representing the IoU overlap threshold for an
RoI to be considered background (class = 0 if overlap in [LO, HI)).
bg_iou_thresh_lo: a float representing the IoU overlap threshold for an
RoI to be considered background (class = 0 if overlap in [LO, HI)).
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_roi_scores: a tensor of shape of [batch_size, K], representing the
confidence score of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
indices of the sampled groundtruth boxes in the original `gt_boxes`
tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] =
sampled_gt_boxes[:, i].
"""
with tf.name_scope('sample_proposals_and_scores'):
if mix_gt_boxes:
boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
gt_scores = tf.ones_like(gt_boxes[:, :, 0])
scores = tf.concat([proposed_scores, gt_scores], axis=1)
else:
boxes = proposed_boxes
scores = proposed_scores
(matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
_) = box_matching(boxes, gt_boxes, gt_classes)
positive_match = tf.greater(matched_iou, fg_iou_thresh)
negative_match = tf.logical_and(
tf.greater_equal(matched_iou, bg_iou_thresh_lo),
tf.less(matched_iou, bg_iou_thresh_hi))
ignored_match = tf.less(matched_iou, 0.0)
# re-assign negatively matched boxes to the background class.
matched_gt_classes = tf.where(negative_match,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(negative_match,
tf.zeros_like(matched_gt_indices),
matched_gt_indices)
sample_candidates = tf.logical_and(
tf.logical_or(positive_match, negative_match),
tf.logical_not(ignored_match))
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=fg_fraction, is_static=True))
batch_size, _ = sample_candidates.get_shape().as_list()
sampled_indicators = []
for i in range(batch_size):
sampled_indicator = sampler.subsample(sample_candidates[i],
num_samples_per_image,
positive_match[i])
sampled_indicators.append(sampled_indicator)
sampled_indicators = tf.stack(sampled_indicators)
_, sampled_indices = tf.nn.top_k(
tf.cast(sampled_indicators, dtype=tf.int32),
k=num_samples_per_image,
sorted=True)
sampled_indices_shape = tf.shape(sampled_indices)
batch_indices = (
tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)
sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
sampled_roi_scores = tf.gather_nd(scores, gather_nd_indices)
sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)
return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
sampled_gt_classes, sampled_gt_indices)
class MaskSampler(tf.keras.layers.Layer):
"""Samples and creates mask training targets."""
def __init__(self, mask_target_size, num_mask_samples_per_image):
self._mask_target_size = mask_target_size
self._num_mask_samples_per_image = num_mask_samples_per_image
super(MaskSampler, self).__init__(autocast=False)
def call(self,
candidate_rois,
candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices,
gt_masks):
"""Sample and create mask targets for training.
Args:
candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
number of candidate RoIs to be considered for mask sampling. It includes
both positive and negative RoIs. The `num_mask_samples_per_image`
positive RoIs will be sampled to create mask training targets.
candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
corresponding groundtruth classes to the `candidate_rois`. 0 in the
tensor corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: a tensor of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
where gt_boxes, of shape [batch_size, MAX_INSTANCES, 4] with
MAX_INSTANCES >= N, is the superset of candidate_gt_boxes.
gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
containing all the groundtruth masks from which the sampled masks are
drawn. The output masks are cropped to the `mask_target_size` set at
construction and resized w.r.t. the sampled RoIs.
Returns:
foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: a tensor of shape of [batch_size, K] storing the
classes corresponding to the sampled foreground masks.
cropped_foreground_masks: a tensor of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the
cropped foreground masks used for training.
"""
foreground_rois, foreground_classes, cropped_foreground_masks = (
sample_and_crop_foreground_masks(candidate_rois, candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices, gt_masks,
self._num_mask_samples_per_image,
self._mask_target_size))
return foreground_rois, foreground_classes, cropped_foreground_masks
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for bounding box processing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)
def visualize_images_with_bounding_boxes(images, box_outputs, step,
summary_writer):
"""Records subset of evaluation images with bounding boxes."""
image_shape = tf.shape(images[0])
image_height = tf.cast(image_shape[0], tf.float32)
image_width = tf.cast(image_shape[1], tf.float32)
normalized_boxes = normalize_boxes(box_outputs, [image_height, image_width])
bounding_box_color = tf.constant([[1.0, 1.0, 0.0, 1.0]])
image_summary = tf.image.draw_bounding_boxes(images, normalized_boxes,
bounding_box_color)
with summary_writer.as_default():
tf.summary.image('bounding_box_summary', image_summary, step=step)
summary_writer.flush()
def yxyx_to_xywh(boxes):
"""Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.
Args:
boxes: a numpy array whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a numpy array whose shape is the same as `boxes` in new format.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
boxes_ymin = boxes[..., 0]
boxes_xmin = boxes[..., 1]
boxes_width = boxes[..., 3] - boxes[..., 1]
boxes_height = boxes[..., 2] - boxes[..., 0]
new_boxes = np.stack([boxes_xmin, boxes_ymin, boxes_width, boxes_height],
axis=-1)
return new_boxes
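# Worked example (illustrative):
#
#   boxes = np.array([[10., 20., 50., 80.]])  # [ymin, xmin, ymax, xmax]
#   yxyx_to_xywh(boxes)
#   # -> array([[20., 10., 60., 40.]])        # [xmin, ymin, width, height]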
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
noise_scale: a python float which specifies the magnitude of noise. The rule
of thumb is to set this between (0, 0.1]. The default value is found to
mimic the noisy detections best empirically.
Returns:
jittered_boxes: a tensor whose shape is the same as `boxes` representing
the jittered boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('jitter_boxes'):
bbox_jitters = tf.random.normal(boxes.get_shape(), stddev=noise_scale)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
width = xmax - xmin
height = ymax - ymin
new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
jittered_boxes = tf.concat([
new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
new_center_y + new_height * 0.5, new_center_x + new_width * 0.5
],
axis=-1)
return jittered_boxes
def normalize_boxes(boxes, image_shape):
"""Converts boxes to the normalized coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
normalized_boxes: a tensor whose shape is the same as `boxes` representing
the normalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('normalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0:1]
width = image_shape[..., 1:2]
ymin = boxes[..., 0:1] / height
xmin = boxes[..., 1:2] / width
ymax = boxes[..., 2:3] / height
xmax = boxes[..., 3:4] / width
normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return normalized_boxes
def denormalize_boxes(boxes, image_shape):
"""Converts boxes normalized by [height, width] to pixel coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
denormalized_boxes: a tensor whose shape is the same as `boxes` representing
the denormalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
with tf.name_scope('denormalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.split(image_shape, 2, axis=-1)
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
ymin = ymin * height
xmin = xmin * width
ymax = ymax * height
xmax = xmax * width
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return denormalized_boxes
def clip_boxes(boxes, image_shape):
"""Clips boxes to image boundaries.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
clipped_boxes: a tensor whose shape is the same as `boxes` representing the
clipped boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('clip_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
max_length = [height - 1.0, width - 1.0, height - 1.0, width - 1.0]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.unstack(image_shape, axis=-1)
max_length = tf.stack(
[height - 1.0, width - 1.0, height - 1.0, width - 1.0], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def compute_outer_boxes(boxes, image_shape, scale=1.0):
"""Compute outer box encloses an object with a margin.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
scale: a float number specifying the scale of output outer boxes to input
`boxes`.
Returns:
outer_boxes: a tensor whose shape is the same as `boxes` representing the
outer boxes.
"""
  if scale < 1.0:
    raise ValueError(
        'scale is {}, but outer box scale must be no less than 1.0.'.format(
            scale))
centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
box_height = (boxes[..., 2] - boxes[..., 0]) * scale
box_width = (boxes[..., 3] - boxes[..., 1]) * scale
outer_boxes = tf.stack([
centers_y - box_height / 2.0, centers_x - box_width / 2.0,
centers_y + box_height / 2.0, centers_x + box_width / 2.0
],
axis=1)
outer_boxes = clip_boxes(outer_boxes, image_shape)
return outer_boxes
def encode_boxes(boxes, anchors, weights=None):
"""Encode boxes to targets.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes: a tensor whose shape is the same as `boxes` representing the
encoded box targets.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('encode_boxes'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
box_h = ymax - ymin + 1.0
box_w = xmax - xmin + 1.0
box_yc = ymin + 0.5 * box_h
box_xc = xmin + 0.5 * box_w
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin + 1.0
anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
encoded_dy = (box_yc - anchor_yc) / anchor_h
encoded_dx = (box_xc - anchor_xc) / anchor_w
encoded_dh = tf.math.log(box_h / anchor_h)
encoded_dw = tf.math.log(box_w / anchor_w)
if weights:
encoded_dy *= weights[0]
encoded_dx *= weights[1]
encoded_dh *= weights[2]
encoded_dw *= weights[3]
encoded_boxes = tf.concat([encoded_dy, encoded_dx, encoded_dh, encoded_dw],
axis=-1)
return encoded_boxes
def decode_boxes(encoded_boxes, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
    decoded_boxes: a tensor whose shape is the same as `encoded_boxes`
      representing the decoded box targets.
"""
if encoded_boxes.shape[-1] != 4:
raise ValueError('encoded_boxes.shape[-1] is {:d}, but must be 4.'.format(
encoded_boxes.shape[-1]))
with tf.name_scope('decode_boxes'):
encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
dy = encoded_boxes[..., 0:1]
dx = encoded_boxes[..., 1:2]
dh = encoded_boxes[..., 2:3]
dw = encoded_boxes[..., 3:4]
if weights:
dy /= weights[0]
dx /= weights[1]
dh /= weights[2]
dw /= weights[3]
dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin + 1.0
anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
decoded_boxes_yc = dy * anchor_h + anchor_yc
decoded_boxes_xc = dx * anchor_w + anchor_xc
decoded_boxes_h = tf.math.exp(dh) * anchor_h
decoded_boxes_w = tf.math.exp(dw) * anchor_w
decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h - 1.0
decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w - 1.0
decoded_boxes = tf.concat([
decoded_boxes_ymin, decoded_boxes_xmin, decoded_boxes_ymax,
decoded_boxes_xmax
],
axis=-1)
return decoded_boxes
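# Illustrative sketch, not part of the original module: a round trip through
# `encode_boxes` and `decode_boxes` above. The helper name and sample values
# are assumptions made purely for demonstration.
def _example_encode_decode_boxes():
  boxes = tf.constant([[10.0, 10.0, 50.0, 90.0]])   # [N=1, 4] in yxyx order.
  anchors = tf.constant([[0.0, 0.0, 64.0, 64.0]])   # Matching anchors.
  weights = [10.0, 10.0, 5.0, 5.0]                  # Faster R-CNN style scaling.
  targets = encode_boxes(boxes, anchors, weights)
  recovered = decode_boxes(targets, anchors, weights)
  # `recovered` equals `boxes` up to floating-point error.
  return targets, recovered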
def encode_boxes_lrtb(boxes, anchors, weights=None):
"""Encode boxes to targets on lrtb (=left,right,top,bottom) format.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes_lrtb: a tensor whose shape is the same as `boxes` representing
the encoded box targets. The box targets encode the left, right, top,
bottom distances from an anchor location to the four borders of the
matched groundtruth bounding box.
center_targets: centerness targets defined by the left, right, top, and
bottom distance targets. The centerness is defined as the deviation of the
anchor location from the groundtruth object center. Formally, centerness =
sqrt(min(left, right)/max(left, right)*min(top, bottom)/max(top, bottom)).
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('encode_boxes_lrtb'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
# box_h = ymax - ymin + 1.0
# box_w = xmax - xmin + 1.0
box_h = ymax - ymin
box_w = xmax - xmin
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
# anchor_h = anchor_ymax - anchor_ymin + 1.0
# anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
box_h += EPSILON
box_w += EPSILON
anchor_h += EPSILON
anchor_w += EPSILON
left = (anchor_xc - xmin) / anchor_w
right = (xmax - anchor_xc) / anchor_w
top = (anchor_yc - ymin) / anchor_h
bottom = (ymax - anchor_yc) / anchor_h
# Create centerness target. {
lrtb_targets = tf.concat([left, right, top, bottom], axis=-1)
valid_match = tf.greater(tf.reduce_min(lrtb_targets, -1), 0.0)
# Centerness score.
left_right = tf.concat([left, right], axis=-1)
left_right = tf.where(tf.stack([valid_match, valid_match], -1),
left_right, tf.zeros_like(left_right))
top_bottom = tf.concat([top, bottom], axis=-1)
top_bottom = tf.where(tf.stack([valid_match, valid_match], -1),
top_bottom, tf.zeros_like(top_bottom))
center_targets = tf.sqrt(
(tf.reduce_min(left_right, -1) /
(tf.reduce_max(left_right, -1) + EPSILON)) *
(tf.reduce_min(top_bottom, -1) /
(tf.reduce_max(top_bottom, -1) + EPSILON)))
center_targets = tf.where(valid_match,
center_targets,
tf.zeros_like(center_targets))
if weights:
left *= weights[0]
right *= weights[1]
top *= weights[2]
bottom *= weights[3]
encoded_boxes_lrtb = tf.concat(
[left, right, top, bottom],
axis=-1)
return encoded_boxes_lrtb, center_targets
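# Illustrative sketch, not part of the original module: the centerness target
# produced by `encode_boxes_lrtb` when the anchor center coincides with the
# groundtruth box center. The helper name and values are assumptions.
def _example_lrtb_centerness():
  boxes = tf.constant([[0.0, 0.0, 40.0, 80.0]])      # Box center at (20, 40).
  anchors = tf.constant([[10.0, 30.0, 30.0, 50.0]])  # Anchor center at (20, 40).
  lrtb_targets, centerness = encode_boxes_lrtb(boxes, anchors)
  # left == right and top == bottom here, so centerness is ~1.0; it decays
  # toward 0 as the anchor center moves toward a box border.
  return lrtb_targets, centerness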
def decode_boxes_lrtb(encoded_boxes_lrtb, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes_lrtb: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in left, right, top, bottom order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
    decoded_boxes_lrtb: a tensor whose shape is the same as `encoded_boxes_lrtb`
      representing the decoded boxes in ymin, xmin, ymax, xmax order, recovered
      from the left, right, top, and bottom distances between each anchor center
      and the four borders of the matched groundtruth bounding box.
"""
if encoded_boxes_lrtb.shape[-1] != 4:
raise ValueError(
'encoded_boxes_lrtb.shape[-1] is {:d}, but must be 4.'
.format(encoded_boxes_lrtb.shape[-1]))
with tf.name_scope('decode_boxes_lrtb'):
encoded_boxes_lrtb = tf.cast(encoded_boxes_lrtb, dtype=anchors.dtype)
left = encoded_boxes_lrtb[..., 0:1]
right = encoded_boxes_lrtb[..., 1:2]
top = encoded_boxes_lrtb[..., 2:3]
bottom = encoded_boxes_lrtb[..., 3:4]
if weights:
left /= weights[0]
right /= weights[1]
top /= weights[2]
bottom /= weights[3]
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
anchor_h += EPSILON
anchor_w += EPSILON
decoded_boxes_ymin = anchor_yc - top * anchor_h
decoded_boxes_xmin = anchor_xc - left * anchor_w
decoded_boxes_ymax = anchor_yc + bottom * anchor_h
decoded_boxes_xmax = anchor_xc + right * anchor_w
decoded_boxes_lrtb = tf.concat(
[decoded_boxes_ymin, decoded_boxes_xmin,
decoded_boxes_ymax, decoded_boxes_xmax],
axis=-1)
return decoded_boxes_lrtb
def filter_boxes(boxes, scores, image_shape, min_size_threshold):
"""Filter and remove boxes that are too small or fall outside the image.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
image_shape: a tensor whose shape is the same as, or `broadcastable` to
`boxes` except the last dimension, which is 2, representing [height,
width] of the scaled image.
min_size_threshold: a float representing the minimal box size in each side
(w.r.t. the scaled image). Boxes whose sides are smaller than it will be
filtered out.
Returns:
    filtered_boxes: a tensor whose shape is the same as `boxes` but with the
      positions of the filtered boxes filled with 0.
    filtered_scores: a tensor whose shape is the same as `scores` but with the
      positions of the filtered boxes filled with 0.
"""
if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0]
width = image_shape[..., 1]
ymin = boxes[..., 0]
xmin = boxes[..., 1]
ymax = boxes[..., 2]
xmax = boxes[..., 3]
h = ymax - ymin + 1.0
w = xmax - xmin + 1.0
yc = ymin + 0.5 * h
xc = xmin + 0.5 * w
min_size = tf.cast(
tf.math.maximum(min_size_threshold, 1.0), dtype=boxes.dtype)
filtered_size_mask = tf.math.logical_and(
tf.math.greater(h, min_size), tf.math.greater(w, min_size))
filtered_center_mask = tf.logical_and(
tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
filtered_mask = tf.math.logical_and(filtered_size_mask,
filtered_center_mask)
filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
def filter_boxes_by_scores(boxes, scores, min_score_threshold):
"""Filter and remove boxes whose scores are smaller than the threshold.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
min_score_threshold: a float representing the minimal box score threshold.
Boxes whose score are smaller than it will be filtered out.
Returns:
    filtered_boxes: a tensor whose shape is the same as `boxes` but with the
      positions of the filtered boxes filled with 0.
    filtered_scores: a tensor whose shape is the same as `scores` but with the
      scores of the filtered boxes filled with -1.
"""
if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes_by_scores'):
filtered_mask = tf.math.greater(scores, min_score_threshold)
filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
def top_k_boxes(boxes, scores, k):
"""Sort and select top k boxes according to the scores.
Args:
    boxes: a tensor of shape [batch_size, N, 4] representing the coordinates of
      the boxes. N is the number of boxes per image.
    scores: a tensor of shape [batch_size, N] representing the scores of the
      boxes.
k: an integer or a tensor indicating the top k number.
Returns:
selected_boxes: a tensor of shape [batch_size, k, 4] representing the
selected top k box coordinates.
selected_scores: a tensor of shape [batch_size, k] representing the selected
top k box scores.
"""
with tf.name_scope('top_k_boxes'):
selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
batch_size, _ = scores.get_shape().as_list()
if batch_size == 1:
selected_boxes = tf.squeeze(
tf.gather(boxes, top_k_indices, axis=1), axis=1)
else:
top_k_indices_shape = tf.shape(top_k_indices)
batch_indices = (
tf.expand_dims(tf.range(top_k_indices_shape[0]), axis=-1) *
tf.ones([1, top_k_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, top_k_indices], axis=-1)
selected_boxes = tf.gather_nd(boxes, gather_nd_indices)
return selected_boxes, selected_scores
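# Illustrative sketch, not part of the original module: keep the two highest
# scoring boxes per image with `top_k_boxes`. Names and values are assumptions.
def _example_top_k_boxes():
  boxes = tf.constant([[[0.0, 0.0, 1.0, 1.0],
                        [0.0, 0.0, 2.0, 2.0],
                        [0.0, 0.0, 3.0, 3.0]]])     # [batch=1, N=3, 4].
  scores = tf.constant([[0.1, 0.9, 0.5]])           # [batch=1, N=3].
  selected_boxes, selected_scores = top_k_boxes(boxes, scores, k=2)
  # selected_scores == [[0.9, 0.5]] and selected_boxes keeps rows 1 and 2.
  return selected_boxes, selected_scores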
def bbox_overlap(boxes, gt_boxes):
"""Calculates the overlap between proposal and ground truth boxes.
Some `gt_boxes` may have been padded. The returned `iou` tensor for these
boxes will be -1.
Args:
boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
tensor might have paddings with a negative value.
Returns:
    iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
"""
with tf.name_scope('bbox_overlap'):
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=2)
# Calculates the intersection area.
i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
i_area = tf.math.maximum((i_xmax - i_xmin), 0) * tf.math.maximum(
(i_ymax - i_ymin), 0)
# Calculates the union area.
bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
# Adds a small epsilon to avoid divide-by-zero.
u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
# Calculates IoU.
iou = i_area / u_area
# Fills -1 for IoU entries between the padded ground truth boxes.
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
padding_mask = tf.logical_or(
tf.zeros_like(bb_x_min, dtype=tf.bool),
tf.transpose(gt_invalid_mask, [0, 2, 1]))
iou = tf.where(padding_mask, -tf.ones_like(iou), iou)
return iou
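# Illustrative sketch, not part of the original module: IoU between a single
# proposal and two groundtruth boxes, the second of which is padding (all -1)
# and therefore gets an IoU of -1. Names and values are assumptions.
def _example_bbox_overlap():
  boxes = tf.constant([[[0.0, 0.0, 10.0, 10.0]]])        # [batch=1, N=1, 4].
  gt_boxes = tf.constant([[[0.0, 0.0, 10.0, 20.0],
                           [-1.0, -1.0, -1.0, -1.0]]])   # [batch=1, M=2, 4].
  iou = bbox_overlap(boxes, gt_boxes)
  # iou is approximately [[[0.5, -1.0]]].
  return iou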
def get_non_empty_box_indices(boxes):
"""Get indices for non-empty boxes."""
  # Selects indices where both box height and width are greater than 0.
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
indices = tf.where(
tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
return indices[:, 0]
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for handling dataset object categories."""
def coco_split_class_ids(split_name):
"""Return the COCO class split ids based on split name and training mode.
Args:
split_name: The name of dataset split.
Returns:
    class_ids: a python list of integers.
"""
if split_name == 'all':
return []
elif split_name == 'voc':
return [
1, 2, 3, 4, 5, 6, 7, 9, 16, 17, 18, 19, 20, 21, 44, 62, 63, 64, 67, 72
]
elif split_name == 'nonvoc':
return [
8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
57, 58, 59, 60, 61, 65, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
85, 86, 87, 88, 89, 90
]
else:
raise ValueError('Invalid split name {}!!!'.format(split_name))
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for dataloader."""
import tensorflow as tf
from official.legacy.detection.utils import input_utils
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
pred=tf.equal(tf.size(input=source_id), 0),
true_fn=lambda: tf.cast(tf.constant(-1), tf.int64),
false_fn=lambda: tf.identity(source_id))
return source_id
def pad_groundtruths_to_fixed_size(gt, n):
"""Pads the first dimension of groundtruths labels to the fixed size."""
gt['boxes'] = input_utils.pad_to_fixed_size(gt['boxes'], n, -1)
gt['is_crowds'] = input_utils.pad_to_fixed_size(gt['is_crowds'], n, 0)
gt['areas'] = input_utils.pad_to_fixed_size(gt['areas'], n, -1)
gt['classes'] = input_utils.pad_to_fixed_size(gt['classes'], n, -1)
return gt
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for input processing."""
import math
import tensorflow as tf
from official.legacy.detection.utils import box_utils
from official.vision.utils.object_detection import preprocessor
def pad_to_fixed_size(input_tensor, size, constant_values=0):
"""Pads data to a fixed length at the first dimension.
Args:
input_tensor: `Tensor` with any dimension.
size: `int` number for the first dimension of output Tensor.
constant_values: `int` value assigned to the paddings.
Returns:
`Tensor` with the first dimension padded to `size`.
"""
input_shape = input_tensor.get_shape().as_list()
padding_shape = []
# Computes the padding length on the first dimension.
padding_length = tf.maximum(0, size - tf.shape(input_tensor)[0])
assert_length = tf.Assert(
tf.greater_equal(padding_length, 0), [padding_length])
with tf.control_dependencies([assert_length]):
padding_shape.append(padding_length)
# Copies shapes of the rest of input shape dimensions.
for i in range(1, len(input_shape)):
padding_shape.append(tf.shape(input=input_tensor)[i])
# Pads input tensor to the fixed first dimension.
paddings = tf.cast(constant_values * tf.ones(padding_shape),
input_tensor.dtype)
padded_tensor = tf.concat([input_tensor, paddings], axis=0)
output_shape = input_shape
output_shape[0] = size
padded_tensor.set_shape(output_shape)
return padded_tensor
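# Illustrative sketch, not part of the original module: pad a variable number
# of boxes to a fixed first dimension, as pad_groundtruths_to_fixed_size in the
# dataloader utilities does. The helper name and values are assumptions.
def _example_pad_to_fixed_size():
  boxes = tf.constant([[0.0, 0.0, 5.0, 5.0],
                       [1.0, 2.0, 3.0, 4.0]])       # Two boxes.
  padded = pad_to_fixed_size(boxes, size=10, constant_values=-1)
  # `padded` has shape [10, 4]; rows 2..9 are filled with -1.
  return padded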
def normalize_image(image,
offset=(0.485, 0.456, 0.406),
scale=(0.229, 0.224, 0.225)):
"""Normalizes the image to zero mean and unit variance."""
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
offset = tf.constant(offset)
offset = tf.expand_dims(offset, axis=0)
offset = tf.expand_dims(offset, axis=0)
image -= offset
scale = tf.constant(scale)
scale = tf.expand_dims(scale, axis=0)
scale = tf.expand_dims(scale, axis=0)
image /= scale
return image
def compute_padded_size(desired_size, stride):
"""Compute the padded size given the desired size and the stride.
  The padded size will be the smallest rectangle, such that each dimension is
  the smallest multiple of the stride that is no smaller than the desired
  dimension. For example, if desired_size = (100, 200) and stride = 32,
the output padded_size = (128, 224).
Args:
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the target output image size.
stride: an integer, the stride of the backbone network.
Returns:
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size.
"""
if isinstance(desired_size, list) or isinstance(desired_size, tuple):
padded_size = [
int(math.ceil(d * 1.0 / stride) * stride) for d in desired_size
]
else:
padded_size = tf.cast(
tf.math.ceil(tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
tf.int32)
return padded_size
def resize_and_crop_image(image,
desired_size,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size.
Resize and pad images given the desired output size of the image and
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and rescale the image to make it
the largest rectangle to be bounded by the rectangle specified by the
`desired_size`.
2. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the desired actual output image size.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
      the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image'):
image_size = tf.cast(tf.shape(input=image)[0:2], tf.float32)
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform([],
aug_scale_min,
aug_scale_max,
seed=seed)
scaled_size = tf.round(random_scale * desired_size)
else:
scaled_size = desired_size
scale = tf.minimum(scaled_size[0] / image_size[0],
scaled_size[1] / image_size[1])
scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([
2,
], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32), image_scale,
tf.cast(offset, tf.float32)
])
return output_image, image_info
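# Illustrative sketch, not part of the original module: resize a 480x640 image
# for a 512x512 input with a stride-32 backbone and no scale jittering.
# The helper name and values are assumptions.
def _example_resize_and_crop_image():
  image = tf.zeros([480, 640, 3], tf.float32)
  desired_size = tf.constant([512.0, 512.0])
  padded_size = compute_padded_size([512, 512], stride=32)   # -> [512, 512].
  output_image, image_info = resize_and_crop_image(
      image, desired_size, padded_size)
  # output_image has shape [512, 512, 3]; image_info[2] holds [y_scale, x_scale].
  return output_image, image_info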
def resize_and_crop_image_v2(image,
short_side,
long_side,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (Faster R-CNN style).
Resize and pad images given the specified short / long side length and the
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and first try to rescale the short
side of the original image to `short_side`.
  2. If the scaled image after step 1 has a long side that exceeds `long_side`,
     keep the aspect ratio and rescale the long side of the image to `long_side`.
  3. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
short_side: a scalar `Tensor` or `int` representing the desired short side
to be rescaled to.
long_side: a scalar `Tensor` or `int` representing the desired long side to
be rescaled to.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image_v2'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
scale_using_short_side = (
short_side / tf.math.minimum(image_size[0], image_size[1]))
scale_using_long_side = (
long_side / tf.math.maximum(image_size[0], image_size[1]))
scaled_size = tf.math.round(image_size * scale_using_short_side)
scaled_size = tf.where(
tf.math.greater(
tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
tf.math.round(image_size * scale_using_long_side), scaled_size)
desired_size = scaled_size
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform([],
aug_scale_min,
aug_scale_max,
seed=seed)
scaled_size = tf.math.round(random_scale * scaled_size)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([
2,
], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32), image_scale,
tf.cast(offset, tf.float32)
])
return output_image, image_info
def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
"""Resizes boxes to output size with scale and offset.
Args:
boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
"""
# Adjusts box coordinates based on image_scale and offset.
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
# Clips the boxes.
boxes = box_utils.clip_boxes(boxes, output_size)
return boxes
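# Illustrative sketch, not part of the original module: rescale groundtruth
# boxes with the scale and offset that resize_and_crop_image reports in
# `image_info`. The helper name and values are assumptions.
def _example_resize_and_crop_boxes():
  boxes = tf.constant([[10.0, 20.0, 110.0, 220.0]])   # One box in yxyx order.
  image_scale = tf.constant([0.5, 0.5])               # image_info[2].
  output_size = tf.constant([512.0, 512.0])
  offset = tf.constant([0.0, 0.0])                    # image_info[3].
  scaled_boxes = resize_and_crop_boxes(boxes, image_scale, output_size, offset)
  # scaled_boxes == [[5.0, 10.0, 55.0, 110.0]] after clipping.
  return scaled_boxes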
def resize_and_crop_masks(masks, image_scale, output_size, offset):
"""Resizes boxes to output size with scale and offset.
Args:
masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      masks.
Returns:
masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
"""
mask_size = tf.shape(input=masks)[1:3]
scaled_size = tf.cast(image_scale * tf.cast(mask_size, image_scale.dtype),
tf.int32)
scaled_masks = tf.image.resize(
masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
offset = tf.cast(offset, tf.int32)
scaled_masks = scaled_masks[:, offset[0]:offset[0] + output_size[0],
offset[1]:offset[1] + output_size[1], :]
output_masks = tf.image.pad_to_bounding_box(scaled_masks, 0, 0,
output_size[0], output_size[1])
return output_masks
def random_horizontal_flip(image, boxes=None, masks=None):
"""Randomly flips input image and bounding boxes."""
return preprocessor.random_horizontal_flip(image, boxes, masks)
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for segmentations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import cv2
import numpy as np
def paste_instance_masks(masks, detected_boxes, image_height, image_width):
"""Paste instance masks to generate the image segmentation results.
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
_, mask_height, mask_width = masks.shape
scale = max((mask_width + 2.0) / mask_width,
(mask_height + 2.0) / mask_height)
ref_boxes = expand_boxes(detected_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
# Process mask inside bounding boxes.
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
x_0 = min(max(ref_box[0], 0), image_width)
x_1 = min(max(ref_box[2] + 1, 0), image_width)
y_0 = min(max(ref_box[1], 0), image_height)
y_1 = min(max(ref_box[3] + 1, 0), image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]),
(x_0 - ref_box[0]):(x_1 - ref_box[0])]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
def paste_instance_masks_v2(masks, detected_boxes, image_height, image_width):
"""Paste instance masks to generate the image segmentation (v2).
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
_, mask_height, mask_width = masks.shape
segms = []
for i, mask in enumerate(masks):
box = detected_boxes[i, :]
xmin = box[0]
ymin = box[1]
xmax = xmin + box[2]
ymax = ymin + box[3]
# Sample points of the cropped mask w.r.t. the image grid.
# Note that these coordinates may fall beyond the image.
# Pixel clipping will happen after warping.
xmin_int = int(math.floor(xmin))
xmax_int = int(math.ceil(xmax))
ymin_int = int(math.floor(ymin))
ymax_int = int(math.ceil(ymax))
alpha = box[2] / (1.0 * mask_width)
beta = box[3] / (1.0 * mask_height)
# pylint: disable=invalid-name
# Transformation from mask pixel indices to image coordinate.
M_mask_to_image = np.array([[alpha, 0, xmin], [0, beta, ymin], [0, 0, 1]],
dtype=np.float32)
# Transformation from image to cropped mask coordinate.
M_image_to_crop = np.array(
[[1, 0, -xmin_int], [0, 1, -ymin_int], [0, 0, 1]], dtype=np.float32)
M = np.dot(M_image_to_crop, M_mask_to_image)
# Compensate the half pixel offset that OpenCV has in the
# warpPerspective implementation: the top-left pixel is sampled
# at (0,0), but we want it to be at (0.5, 0.5).
M = np.dot(
np.dot(
np.array([[1, 0, -0.5], [0, 1, -0.5], [0, 0, 1]], np.float32), M),
np.array([[1, 0, 0.5], [0, 1, 0.5], [0, 0, 1]], np.float32))
# pylint: enable=invalid-name
cropped_mask = cv2.warpPerspective(
mask.astype(np.float32), M, (xmax_int - xmin_int, ymax_int - ymin_int))
cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)
img_mask = np.zeros((image_height, image_width))
x0 = max(min(xmin_int, image_width), 0)
x1 = max(min(xmax_int, image_width), 0)
y0 = max(min(ymin_int, image_height), 0)
y1 = max(min(ymax_int, image_height), 0)
img_mask[y0:y1, x0:x1] = cropped_mask[(y0 - ymin_int):(y1 - ymin_int),
(x0 - xmin_int):(x1 - xmin_int)]
segms.append(img_mask)
segms = np.array(segms)
return segms
# Image Classification
**Warning:** the features in the `image_classification/` directory have been
fully integrated into the [new code base](https://github.com/tensorflow/models/tree/benchmark/official/vision/modeling/backbones).
This folder contains TF 2 model examples for image classification:
* [MNIST](#mnist)
* [Classifier Trainer](#classifier-trainer), a framework that uses the Keras
compile/fit methods for image classification models, including:
* ResNet
* EfficientNet[^1]
[^1]: Currently a work in progress. We cannot match "AutoAugment (AA)" in [the original version](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet).
For more information about other types of models, please refer to this
[README file](../../README.md).
## Before you begin
Please make sure that you have the latest version of TensorFlow
installed and add the models folder to your Python path.
### ImageNet preparation
#### Using TFDS
`classifier_trainer.py` supports ImageNet with
[TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview).
Please see the following [example snippet](https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/scripts/download_and_prepare.py)
for more information on how to use TFDS to download and prepare datasets, and
specifically the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md)
for manual download instructions.
#### Legacy TFRecords
Download the ImageNet dataset and convert it to TFRecord format.
The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
provide a few options.
Note that the legacy ResNet runners, e.g. [resnet/resnet_ctl_imagenet_main.py](resnet/resnet_ctl_imagenet_main.py),
require TFRecords, whereas `classifier_trainer.py` can use either format by setting the
builder to 'records' or 'tfds' in the configurations.
### Running on Cloud TPUs
Note: These models will **not** work with TPUs on Colab.
You can train image classification models on Cloud TPUs using
[tf.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf.distribute.TPUStrategy?version=nightly).
If you are not familiar with Cloud TPUs, it is strongly recommended that you go
through the
[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
create a TPU and GCE VM.
### Running on multiple GPU hosts
You can also train these models on multiple hosts, each with GPUs, using
[tf.distribute.experimental.MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy).
The easiest way to run multi-host benchmarks is to set the
[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
appropriately at each host. e.g., to run using `MultiWorkerMirroredStrategy` on
2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
available GPUs at each host.
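For example, a minimal `TF_CONFIG` for the first of two workers might look like
the sketch below; the host names and port are placeholders, and the snippet is
illustrative rather than part of the training scripts. Set `"index": 1` on the
second host.

```python
import json
import os

# Hypothetical two-host cluster; replace the host:port entries with your own.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {'worker': ['host1.example.com:2222', 'host2.example.com:2222']},
    'task': {'type': 'worker', 'index': 0},
})
```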
## MNIST
To download the data and run the MNIST sample model locally for the first time,
run one of the following commands:
<details>
```bash
python3 mnist_main.py \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--train_epochs=10 \
--distribution_strategy=one_device \
--num_gpus=$NUM_GPUS \
--download
```
</details>
To train the model on a Cloud TPU, run the following command:
<details>
```bash
python3 mnist_main.py \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--train_epochs=10 \
--distribution_strategy=tpu \
--download
```
</details>
Note: the `--download` flag is only required the first time you run the model.
## Classifier Trainer
The classifier trainer is a unified framework for running image classification
models using Keras's compile/fit methods. Experiments should be provided in the
form of YAML files; some examples are included in the configs/examples folder.
Please see [configs/examples](./configs/examples) for more example
configurations.
The provided configuration files use a per-replica batch size, which is scaled
by the number of devices. For instance, if `batch size` = 64, then for 1 GPU
the global batch size would be 64 * 1 = 64. For 8 GPUs, the global batch size
would be 64 * 8 = 512. Similarly, for a v3-8 TPU, the global batch size would
be 64 * 8 = 512, and for a v3-32, the global batch size is 64 * 32 = 2048.
### ResNet50
#### On GPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=resnet \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/resnet/imagenet/gpu.yaml \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
</details>
To train on multiple hosts, each with GPUs attached, using
[MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy),
please update the `runtime` section in gpu.yaml
(or override it using `--params_override`) with:
<details>
```YAML
# gpu.yaml
runtime:
distribution_strategy: 'multi_worker_mirrored'
worker_hosts: '$HOST1:port,$HOST2:port'
num_gpus: $NUM_GPUS
task_index: 0
```
</details>
Set `task_index: 0` on the first host, `task_index: 1` on the second,
and so on. `$HOST1` and `$HOST2` are the IP addresses of the hosts, and `port`
can be any free port on the hosts. Only the first host will write
TensorBoard summaries and save checkpoints.
#### On TPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=resnet \
--dataset=imagenet \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/resnet/imagenet/tpu.yaml
```
</details>
### VGG-16
#### On GPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=vgg \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/vgg/imagenet/gpu.yaml \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
</details>
### EfficientNet
**Note: EfficientNet development is a work in progress.**
#### On GPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=efficientnet \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
</details>
#### On TPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=efficientnet \
--dataset=imagenet \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
```
</details>
Note that the number of GPU devices can be overridden in the command line using
`--params_override`. The TPU does not need this override, as the device is fixed
by providing the TPU address or name with the `--tpu` flag.
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AutoAugment and RandAugment policies for enhanced image preprocessing.
AutoAugment Reference: https://arxiv.org/abs/1805.09501
RandAugment Reference: https://arxiv.org/abs/1909.13719
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from typing import Any, Dict, List, Optional, Text, Tuple
import tensorflow as tf
# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.
def to_4d(image: tf.Tensor) -> tf.Tensor:
"""Converts an input Tensor to 4 dimensions.
4D image => [N, H, W, C] or [N, C, H, W]
3D image => [1, H, W, C] or [1, C, H, W]
2D image => [1, H, W, 1]
Args:
image: The 2/3/4D input tensor.
Returns:
A 4D image tensor.
Raises:
`TypeError` if `image` is not a 2/3/4D tensor.
"""
shape = tf.shape(image)
original_rank = tf.rank(image)
left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32)
right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32)
new_shape = tf.concat(
[
tf.ones(shape=left_pad, dtype=tf.int32),
shape,
tf.ones(shape=right_pad, dtype=tf.int32),
],
axis=0,
)
return tf.reshape(image, new_shape)
def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor:
"""Converts a 4D image back to `ndims` rank."""
shape = tf.shape(image)
begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32)
end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32)
new_shape = shape[begin:end]
return tf.reshape(image, new_shape)
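# Illustrative sketch, not part of the original module: to_4d/from_4d round
# trip for a rank-3 image. The helper name is an assumption.
def _example_to_from_4d():
  image = tf.zeros([224, 224, 3], tf.uint8)
  image_4d = to_4d(image)                         # Shape [1, 224, 224, 3].
  restored = from_4d(image_4d, tf.rank(image))    # Shape [224, 224, 3] again.
  return restored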
def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor:
"""Converts translations to a projective transform.
The translation matrix looks like this:
[[1 0 -dx]
[0 1 -dy]
[0 0 1]]
Args:
translations: The 2-element list representing [dx, dy], or a matrix of
2-element lists representing [dx dy] to translate for each image. The
shape must be static.
Returns:
The transformation matrix of shape (num_images, 8).
Raises:
`TypeError` if
- the shape of `translations` is not known or
- the shape of `translations` is not rank 1 or 2.
"""
translations = tf.convert_to_tensor(translations, dtype=tf.float32)
if translations.get_shape().ndims is None:
raise TypeError('translations rank must be statically known')
elif len(translations.get_shape()) == 1:
translations = translations[None]
elif len(translations.get_shape()) != 2:
raise TypeError('translations should have rank 1 or 2.')
num_translations = tf.shape(translations)[0]
return tf.concat(
values=[
tf.ones((num_translations, 1), tf.dtypes.float32),
tf.zeros((num_translations, 1), tf.dtypes.float32),
-translations[:, 0, None],
tf.zeros((num_translations, 1), tf.dtypes.float32),
tf.ones((num_translations, 1), tf.dtypes.float32),
-translations[:, 1, None],
tf.zeros((num_translations, 2), tf.dtypes.float32),
],
axis=1,
)
def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor,
image_height: tf.Tensor) -> tf.Tensor:
"""Converts an angle or angles to a projective transform.
Args:
    angles: A scalar to rotate all images, or a vector to rotate a batch of
      images. The rank must be statically known.
image_width: The width of the image(s) to be transformed.
image_height: The height of the image(s) to be transformed.
Returns:
A tensor of shape (num_images, 8).
Raises:
`TypeError` if `angles` is not rank 0 or 1.
"""
angles = tf.convert_to_tensor(angles, dtype=tf.float32)
if len(angles.get_shape()) == 0: # pylint:disable=g-explicit-length-test
angles = angles[None]
elif len(angles.get_shape()) != 1:
raise TypeError('Angles should have a rank 0 or 1.')
x_offset = ((image_width - 1) -
(tf.math.cos(angles) * (image_width - 1) - tf.math.sin(angles) *
(image_height - 1))) / 2.0
y_offset = ((image_height - 1) -
(tf.math.sin(angles) * (image_width - 1) + tf.math.cos(angles) *
(image_height - 1))) / 2.0
num_angles = tf.shape(angles)[0]
return tf.concat(
values=[
tf.math.cos(angles)[:, None],
-tf.math.sin(angles)[:, None],
x_offset[:, None],
tf.math.sin(angles)[:, None],
tf.math.cos(angles)[:, None],
y_offset[:, None],
tf.zeros((num_angles, 2), tf.dtypes.float32),
],
axis=1,
)
def apply_transform_to_images(
images,
transforms,
fill_mode='reflect',
fill_value=0.0,
interpolation='bilinear',
output_shape=None,
name=None,
):
"""Applies the given transform(s) to the image(s).
Args:
images: A tensor of shape `(num_images, num_rows, num_columns,
num_channels)` (NHWC). The rank must be statically known (the shape is
not `TensorShape(None)`).
transforms: Projective transform matrix/matrices. A vector of length 8 or
tensor of size N x 8. If one row of transforms is [a0, a1, a2, b0, b1,
b2, c0, c1], then it maps the *output* point `(x, y)` to a transformed
*input* point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) /
k)`, where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared
to the transform mapping input points to output points. Note that
gradients are not backpropagated into transformation parameters.
fill_mode: Points outside the boundaries of the input are filled according
to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`).
fill_value: a float represents the value to be filled outside the
boundaries when `fill_mode="constant"`.
interpolation: Interpolation mode. Supported values: `"nearest"`,
`"bilinear"`.
output_shape: Output dimension after the transform, `[height, width]`. If
`None`, output is the same size as input image.
    name: The name of the op.
  Fill mode behavior for each valid value is as follows:
  - `"reflect"`: `(d c b a | a b c d | d c b a)` The input is extended by
    reflecting about the edge of the last pixel.
  - `"constant"`: `(k k k k | a b c d | k k k k)` The input is extended by
    filling all values beyond the edge with the same constant value k = 0.
  - `"wrap"`: `(a b c d | a b c d | a b c d)` The input is extended by
    wrapping around to the opposite edge.
  - `"nearest"`: `(a a a a | a b c d | d d d d)` The input is extended by
    the nearest pixel.
  Input shape: 4D tensor with shape `(samples, height, width, channels)`, in
    `"channels_last"` format.
  Output shape: 4D tensor with shape `(samples, height, width, channels)`, in
    `"channels_last"` format.
Returns:
Image(s) with the same type and shape as `images`, with the given
transform(s) applied. Transformed coordinates outside of the input image
will be filled with zeros.
"""
with tf.name_scope(name or 'transform'):
if output_shape is None:
output_shape = tf.shape(images)[1:3]
if not tf.executing_eagerly():
output_shape_value = tf.get_static_value(output_shape)
if output_shape_value is not None:
output_shape = output_shape_value
output_shape = tf.convert_to_tensor(
output_shape, tf.int32, name='output_shape'
)
if not output_shape.get_shape().is_compatible_with([2]):
raise ValueError(
'output_shape must be a 1-D Tensor of 2 elements: '
'new_height, new_width, instead got '
f'output_shape={output_shape}'
)
fill_value = tf.convert_to_tensor(fill_value, tf.float32, name='fill_value')
return tf.raw_ops.ImageProjectiveTransformV3(
images=images,
output_shape=output_shape,
fill_value=fill_value,
transforms=transforms,
fill_mode=fill_mode.upper(),
interpolation=interpolation.upper(),
)
def transform(image: tf.Tensor, transforms) -> tf.Tensor:
"""Prepares input data for `image_ops.transform`."""
original_ndims = tf.rank(image)
transforms = tf.convert_to_tensor(transforms, dtype=tf.float32)
if transforms.shape.rank == 1:
transforms = transforms[None]
image = to_4d(image)
image = apply_transform_to_images(
images=image, transforms=transforms, interpolation='nearest'
)
return from_4d(image, original_ndims)
def translate(image: tf.Tensor, translations) -> tf.Tensor:
"""Translates image(s) by provided vectors.
Args:
image: An image Tensor of type uint8.
translations: A vector or matrix representing [dx dy].
Returns:
The translated version of the image.
"""
transforms = _convert_translation_to_transform(translations) # pytype: disable=wrong-arg-types # always-use-return-annotations
return transform(image, transforms=transforms)
def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor:
"""Rotates the image by degrees either clockwise or counterclockwise.
Args:
image: An image Tensor of type uint8.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive, the image will be rotated clockwise; otherwise it
      will be rotated counterclockwise.
Returns:
The rotated version of image.
"""
# Convert from degrees to radians.
degrees_to_radians = math.pi / 180.0
radians = tf.cast(degrees * degrees_to_radians, tf.float32)
original_ndims = tf.rank(image)
image = to_4d(image)
image_height = tf.cast(tf.shape(image)[1], tf.float32)
image_width = tf.cast(tf.shape(image)[2], tf.float32)
transforms = _convert_angles_to_transform(
angles=radians, image_width=image_width, image_height=image_height)
# In practice, we should randomize the rotation degrees by flipping
# it negatively half the time, but that's done on 'degrees' outside
# of the function.
image = transform(image, transforms=transforms)
return from_4d(image, original_ndims)
def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor:
"""Blend image1 and image2 using 'factor'.
Factor can be above 0.0. A value of 0.0 means only image1 is used.
A value of 1.0 means only image2 is used. A value between 0.0 and
1.0 means we linearly interpolate the pixel values between the two
images. A value greater than 1.0 "extrapolates" the difference
between the two pixel values, and we clip the results to values
between 0 and 255.
Args:
image1: An image Tensor of type uint8.
image2: An image Tensor of type uint8.
factor: A floating point value above 0.0.
Returns:
A blended image Tensor of type uint8.
"""
if factor == 0.0:
return tf.convert_to_tensor(image1)
if factor == 1.0:
return tf.convert_to_tensor(image2)
image1 = tf.cast(image1, tf.float32)
image2 = tf.cast(image2, tf.float32)
difference = image2 - image1
scaled = factor * difference
# Do addition in float.
temp = tf.cast(image1, tf.float32) + scaled
# Interpolate
if factor > 0.0 and factor < 1.0:
# Interpolation means we always stay within 0 and 255.
return tf.cast(temp, tf.uint8)
# Extrapolate:
#
# We need to clip and then cast.
return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8)
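# A short numeric sketch of blend() semantics, assuming uint8 inputs:
# factor=0.5 interpolates halfway between the two images, while factors above
# 1.0 extrapolate past image2 and are clipped to [0, 255]. The helper name is
# hypothetical.
def _example_blend_midpoint() -> tf.Tensor:
  """Blends a black and a white pixel halfway; each channel is ~127."""
  black = tf.zeros([1, 1, 3], dtype=tf.uint8)
  white = tf.fill([1, 1, 3], tf.constant(255, dtype=tf.uint8))
  return blend(black, white, 0.5)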
def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor:
"""Apply cutout (https://arxiv.org/abs/1708.04552) to image.
  This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
  a random location within `image`. The pixels covered by the mask are filled
  with the value `replace`. The location where the mask is applied is chosen
  uniformly at random over the whole image.
  Args:
    image: An image Tensor of type uint8.
    pad_size: Specifies the size of the zero mask applied to the image. The
      mask will be of size (2*pad_size x 2*pad_size).
    replace: The pixel value to fill into the area covered by the cutout mask.
Returns:
An image Tensor that is of type uint8.
"""
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
# Sample the center location in the image where the zero mask will be applied.
cutout_center_height = tf.random.uniform(
shape=[], minval=0, maxval=image_height, dtype=tf.int32)
cutout_center_width = tf.random.uniform(
shape=[], minval=0, maxval=image_width, dtype=tf.int32)
lower_pad = tf.maximum(0, cutout_center_height - pad_size)
upper_pad = tf.maximum(0, image_height - cutout_center_height - pad_size)
left_pad = tf.maximum(0, cutout_center_width - pad_size)
right_pad = tf.maximum(0, image_width - cutout_center_width - pad_size)
cutout_shape = [
image_height - (lower_pad + upper_pad),
image_width - (left_pad + right_pad)
]
padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
mask = tf.pad(
tf.zeros(cutout_shape, dtype=image.dtype),
padding_dims,
constant_values=1)
mask = tf.expand_dims(mask, -1)
mask = tf.tile(mask, [1, 1, 3])
image = tf.where(
tf.equal(mask, 0),
tf.ones_like(image, dtype=image.dtype) * replace, image)
return image
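# Illustrative usage of cutout(), assuming a uint8 RGB image: a square of up
# to 2*pad_size x 2*pad_size pixels (clipped at the image borders) is filled
# with `replace`. The helper name is hypothetical.
def _example_cutout(image: tf.Tensor) -> tf.Tensor:
  """Applies a gray cutout patch of up to 100x100 pixels to `image`."""
  return cutout(image, pad_size=50, replace=128)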
def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor:
# For each pixel in the image, select the pixel
# if the value is less than the threshold.
# Otherwise, subtract 255 from the pixel.
return tf.where(image < threshold, image, 255 - image)
def solarize_add(image: tf.Tensor,
addition: int = 0,
threshold: int = 128) -> tf.Tensor:
# For each pixel in the image less than threshold
# we add 'addition' amount to it and then clip the
# pixel value to be between 0 and 255. The value
# of 'addition' is between -128 and 128.
added_image = tf.cast(image, tf.int64) + addition
added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8)
return tf.where(image < threshold, added_image, image)
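# A small numeric sketch of the two solarize variants above, assuming uint8
# pixel values: solarize() inverts pixels at or above the threshold, while
# solarize_add() brightens only the pixels below it. The helper name is
# hypothetical.
def _example_solarize_pair():
  """Returns (solarized, solarize_added) versions of a one-pixel image."""
  image = tf.constant([[[100, 150, 200]]], dtype=tf.uint8)
  return solarize(image, threshold=128), solarize_add(image, addition=50)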
def color(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Color."""
degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))
return blend(degenerate, image, factor)
def contrast(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Contrast."""
degenerate = tf.image.rgb_to_grayscale(image)
# Cast before calling tf.histogram.
degenerate = tf.cast(degenerate, tf.int32)
  # Compute the grayscale histogram, then compute the mean pixel value,
  # and create a constant image of that mean value. Use that as the
  # degenerate blending target for the original image.
hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256)
mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0
degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean
degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8))
return blend(degenerate, image, factor)
def brightness(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Brightness."""
degenerate = tf.zeros_like(image)
return blend(degenerate, image, factor)
def posterize(image: tf.Tensor, bits: int) -> tf.Tensor:
"""Equivalent of PIL Posterize."""
shift = 8 - bits
return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift)
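# A quick sketch of posterize(): with bits=4 the lower 4 bits of each channel
# are zeroed, e.g. 173 (0b10101101) becomes 160 (0b10100000). The helper name
# is hypothetical.
def _example_posterize() -> tf.Tensor:
  """Posterizes a one-pixel image down to 4 bits per channel."""
  return posterize(tf.constant([[[173, 6, 255]]], dtype=tf.uint8), bits=4)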
def wrapped_rotate(image: tf.Tensor, degrees: float, replace: int) -> tf.Tensor:
"""Applies rotation with wrap/unwrap."""
image = rotate(wrap(image), degrees=degrees)
return unwrap(image, replace)
def translate_x(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
"""Equivalent of PIL Translate in X dimension."""
image = translate(wrap(image), [-pixels, 0])
return unwrap(image, replace)
def translate_y(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
"""Equivalent of PIL Translate in Y dimension."""
image = translate(wrap(image), [0, -pixels])
return unwrap(image, replace)
def shear_x(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
"""Equivalent of PIL Shearing in X dimension."""
# Shear parallel to x axis is a projective transform
# with a matrix form of:
# [1 level
# 0 1].
image = transform(
image=wrap(image), transforms=[1., level, 0., 0., 1., 0., 0., 0.])
return unwrap(image, replace)
def shear_y(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
"""Equivalent of PIL Shearing in Y dimension."""
# Shear parallel to y axis is a projective transform
# with a matrix form of:
# [1 0
# level 1].
image = transform(
image=wrap(image), transforms=[1., 0., 0., level, 1., 0., 0., 0.])
return unwrap(image, replace)
def autocontrast(image: tf.Tensor) -> tf.Tensor:
"""Implements Autocontrast function from PIL using TF ops.
Args:
image: A 3D uint8 tensor.
Returns:
    The image after autocontrast has been applied; the result is of type
    uint8.
"""
def scale_channel(image: tf.Tensor) -> tf.Tensor:
"""Scale the 2D image using the autocontrast rule."""
    # A possibly cheaper version could use cumsum/unique_with_counts over the
    # histogram values to compute the mins and maxes, rather than iterating
    # over the entire image.
lo = tf.cast(tf.reduce_min(image), tf.float32)
hi = tf.cast(tf.reduce_max(image), tf.float32)
# Scale the image, making the lowest value 0 and the highest value 255.
def scale_values(im):
scale = 255.0 / (hi - lo)
offset = -lo * scale
im = tf.cast(im, tf.float32) * scale + offset
im = tf.clip_by_value(im, 0.0, 255.0)
return tf.cast(im, tf.uint8)
result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image)
return result
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image[:, :, 0])
s2 = scale_channel(image[:, :, 1])
s3 = scale_channel(image[:, :, 2])
image = tf.stack([s1, s2, s3], 2)
return image
def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Implements Sharpness function from PIL using TF ops."""
orig_image = image
image = tf.cast(image, tf.float32)
# Make image 4D for conv operation.
image = tf.expand_dims(image, 0)
# SMOOTH PIL Kernel.
kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
dtype=tf.float32,
shape=[3, 3, 1, 1]) / 13.
# Tile across channel dimension.
kernel = tf.tile(kernel, [1, 1, 3, 1])
strides = [1, 1, 1, 1]
degenerate = tf.nn.depthwise_conv2d(
image, kernel, strides, padding='VALID', dilations=[1, 1])
degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0])
# For the borders of the resulting image, fill in the values of the
# original image.
mask = tf.ones_like(degenerate)
padded_mask = tf.pad(mask, [[1, 1], [1, 1], [0, 0]])
padded_degenerate = tf.pad(degenerate, [[1, 1], [1, 1], [0, 0]])
result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image)
# Blend the final result.
return blend(result, orig_image, factor)
def equalize(image: tf.Tensor) -> tf.Tensor:
"""Implements Equalize function from PIL using TF ops."""
def scale_channel(im, c):
"""Scale the data in the channel to implement equalize."""
im = tf.cast(im[:, :, c], tf.int32)
# Compute the histogram of the image channel.
histo = tf.histogram_fixed_width(im, [0, 255], nbins=256)
    # For the purposes of computing the step, filter out the zero-count bins.
nonzero = tf.where(tf.not_equal(histo, 0))
nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1])
step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255
def build_lut(histo, step):
# Compute the cumulative sum, shifting by step // 2
# and then normalization by step.
lut = (tf.cumsum(histo) + (step // 2)) // step
# Shift lut, prepending with 0.
lut = tf.concat([[0], lut[:-1]], 0)
# Clip the counts to be in range. This is done
# in the C code for image.point.
return tf.clip_by_value(lut, 0, 255)
# If step is zero, return the original image. Otherwise, build
# lut from the full histogram and step and then index from it.
result = tf.cond(
tf.equal(step, 0), lambda: im,
lambda: tf.gather(build_lut(histo, step), im))
return tf.cast(result, tf.uint8)
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image, 0)
s2 = scale_channel(image, 1)
s3 = scale_channel(image, 2)
image = tf.stack([s1, s2, s3], 2)
return image
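# Illustrative usage of the two contrast-stretching ops above, assuming a
# uint8 RGB image: equalize() remaps intensities through a histogram-based
# lookup table, while autocontrast() linearly rescales each channel so its
# minimum maps to 0 and its maximum to 255. The helper name is hypothetical.
def _example_contrast_ops(image: tf.Tensor):
  """Returns (equalized, autocontrasted) versions of `image`."""
  return equalize(image), autocontrast(image)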
def invert(image: tf.Tensor) -> tf.Tensor:
"""Inverts the image pixels."""
image = tf.convert_to_tensor(image)
return 255 - image
def wrap(image: tf.Tensor) -> tf.Tensor:
"""Returns 'image' with an extra channel set to all 1s."""
shape = tf.shape(image)
extended_channel = tf.ones([shape[0], shape[1], 1], image.dtype)
extended = tf.concat([image, extended_channel], axis=2)
return extended
def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor:
"""Unwraps an image produced by wrap.
  Where the last channel is 0 for a spatial position, the other three
  channels at that position are filled with the `replace` value. Operations
  like translate and shear on a wrapped Tensor will leave 0s in empty
  locations. Some transformations look at the intensity of values to do
  preprocessing, and we want these empty pixels to assume the 'average'
  value (e.g. 128), rather than pure black.
Args:
image: A 3D Image Tensor with 4 channels.
replace: A one or three value 1D tensor to fill empty pixels.
Returns:
image: A 3D image Tensor with 3 channels.
"""
image_shape = tf.shape(image)
# Flatten the spatial dimensions.
flattened_image = tf.reshape(image, [-1, image_shape[2]])
# Find all pixels where the last channel is zero.
alpha_channel = tf.expand_dims(flattened_image[:, 3], axis=-1)
replace = tf.concat([replace, tf.ones([1], image.dtype)], 0)
# Where they are zero, fill them in with 'replace'.
flattened_image = tf.where(
tf.equal(alpha_channel, 0),
tf.ones_like(flattened_image, dtype=image.dtype) * replace,
flattened_image)
image = tf.reshape(flattened_image, image_shape)
image = tf.slice(image, [0, 0, 0], [image_shape[0], image_shape[1], 3])
return image
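# An illustrative round trip through wrap()/unwrap(), assuming a 3-channel
# uint8 image: the extra all-ones channel acts as a validity mask so pixels
# shifted in from outside the image can be recognized and filled with
# `replace`. The helper name is hypothetical.
def _example_wrapped_translate(image: tf.Tensor) -> tf.Tensor:
  """Translates `image` by 10 pixels, filling exposed pixels with gray."""
  wrapped = wrap(image)
  translated = translate(wrapped, [10, 10])
  return unwrap(translated, replace=[128, 128, 128])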
def _randomly_negate_tensor(tensor):
"""With 50% prob turn the tensor negative."""
should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool)
final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor)
return final_tensor
def _rotate_level_to_arg(level: float):
level = (level / _MAX_LEVEL) * 30.
level = _randomly_negate_tensor(level)
return (level,)
def _shrink_level_to_arg(level: float):
"""Converts level to ratio by which we shrink the image content."""
if level == 0:
return (1.0,) # if level is zero, do not shrink the image
# Maximum shrinking ratio is 2.9.
level = 2. / (_MAX_LEVEL / level) + 0.9
return (level,)
def _enhance_level_to_arg(level: float):
return ((level / _MAX_LEVEL) * 1.8 + 0.1,)
def _shear_level_to_arg(level: float):
level = (level / _MAX_LEVEL) * 0.3
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level,)
def _translate_level_to_arg(level: float, translate_const: float):
level = (level / _MAX_LEVEL) * float(translate_const)
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level,)
def _mult_to_arg(level: float, multiplier: float = 1.):
return (int((level / _MAX_LEVEL) * multiplier),)
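# A small numeric sketch of the level-to-argument mappings above, assuming the
# usual _MAX_LEVEL of 10: level 10 maps to a shear magnitude of 0.3 (before
# the random sign flip), an enhancement factor of 1.9, and 4 posterize bits.
# The helper name is hypothetical.
def _example_level_mapping(level: float = 10.):
  """Returns (shear_arg, enhance_arg, posterize_arg) for `level`."""
  return (_shear_level_to_arg(level), _enhance_level_to_arg(level),
          _mult_to_arg(level, 4))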
def _apply_func_with_prob(func: Any, image: tf.Tensor, args: Any, prob: float):
"""Apply `func` to image w/ `args` as input with probability `prob`."""
assert isinstance(args, tuple)
# Apply the function with probability `prob`.
should_apply_op = tf.cast(
tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
augmented_image = tf.cond(should_apply_op, lambda: func(image, *args),
lambda: image)
return augmented_image
def select_and_apply_random_policy(policies: Any, image: tf.Tensor):
"""Select a random policy from `policies` and apply it to `image`."""
policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32)
# Note that using tf.case instead of tf.conds would result in significantly
# larger graphs and would even break export for some larger policies.
for (i, policy) in enumerate(policies):
image = tf.cond(
tf.equal(i, policy_to_select),
lambda selected_policy=policy: selected_policy(image),
lambda: image)
return image
NAME_TO_FUNC = {
'AutoContrast': autocontrast,
'Equalize': equalize,
'Invert': invert,
'Rotate': wrapped_rotate,
'Posterize': posterize,
'Solarize': solarize,
'SolarizeAdd': solarize_add,
'Color': color,
'Contrast': contrast,
'Brightness': brightness,
'Sharpness': sharpness,
'ShearX': shear_x,
'ShearY': shear_y,
'TranslateX': translate_x,
'TranslateY': translate_y,
'Cutout': cutout,
}
# Functions that have a 'replace' parameter
REPLACE_FUNCS = frozenset({
'Rotate',
'TranslateX',
'ShearX',
'ShearY',
'TranslateY',
'Cutout',
})
def level_to_arg(cutout_const: float, translate_const: float):
"""Creates a dict mapping image operation names to their arguments."""
no_arg = lambda level: ()
posterize_arg = lambda level: _mult_to_arg(level, 4)
solarize_arg = lambda level: _mult_to_arg(level, 256)
solarize_add_arg = lambda level: _mult_to_arg(level, 110)
cutout_arg = lambda level: _mult_to_arg(level, cutout_const)
translate_arg = lambda level: _translate_level_to_arg(level, translate_const)
args = {
'AutoContrast': no_arg,
'Equalize': no_arg,
'Invert': no_arg,
'Rotate': _rotate_level_to_arg,
'Posterize': posterize_arg,
'Solarize': solarize_arg,
'SolarizeAdd': solarize_add_arg,
'Color': _enhance_level_to_arg,
'Contrast': _enhance_level_to_arg,
'Brightness': _enhance_level_to_arg,
'Sharpness': _enhance_level_to_arg,
'ShearX': _shear_level_to_arg,
'ShearY': _shear_level_to_arg,
'Cutout': cutout_arg,
'TranslateX': translate_arg,
'TranslateY': translate_arg,
}
return args
def _parse_policy_info(name: Text, prob: float, level: float,
replace_value: List[int], cutout_const: float,
translate_const: float) -> Tuple[Any, float, Any]:
"""Return the function that corresponds to `name` and update `level` param."""
func = NAME_TO_FUNC[name]
args = level_to_arg(cutout_const, translate_const)[name](level)
if name in REPLACE_FUNCS:
# Add in replace arg if it is required for the function that is called.
args = tuple(list(args) + [replace_value])
return func, prob, args
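# Illustrative use of _parse_policy_info(): resolving a single
# ('Rotate', prob, level) entry into a callable plus its arguments. The
# constants mirror the AutoAugment defaults below, and the helper name is
# hypothetical.
def _example_parse_rotate():
  """Parses a Rotate policy entry into (func, prob, args)."""
  return _parse_policy_info('Rotate', prob=0.8, level=8.,
                            replace_value=[128, 128, 128],
                            cutout_const=100., translate_const=250.)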
class ImageAugment(object):
"""Image augmentation class for applying image distortions."""
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Given an image tensor, returns a distorted image with the same shape.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
The augmented version of `image`.
"""
raise NotImplementedError()
class AutoAugment(ImageAugment):
"""Applies the AutoAugment policy to images.
AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
"""
def __init__(self,
augmentation_name: Text = 'v0',
policies: Optional[Dict[Text, Any]] = None,
cutout_const: float = 100,
translate_const: float = 250):
"""Applies the AutoAugment policy to images.
Args:
      augmentation_name: The name of the AutoAugment policy to use. The
        available options are `v0`, `simple` and `test`. `v0` is the policy
        used for all of the results in the paper and was found to achieve the
        best results on the COCO dataset. `simple` is the same policy with the
        custom ops removed, and `test` is a single sub-policy intended for
        debugging.
      policies: optional dict mapping a policy name to a list of sub-policies,
        where each sub-policy is a list of tuples in the form
        `(func, prob, level)`: `func` is a string name of the augmentation
        function, `prob` is the probability of applying `func`, and `level` is
        the input argument for `func`.
cutout_const: multiplier for applying cutout.
translate_const: multiplier for applying translation.
"""
super(AutoAugment, self).__init__()
    if policies is None:
      self.available_policies = {
          'v0': self.policy_v0(),
          'test': self.policy_test(),
          'simple': self.policy_simple(),
      }
    else:
      self.available_policies = policies
if augmentation_name not in self.available_policies:
raise ValueError(
'Invalid augmentation_name: {}'.format(augmentation_name))
self.augmentation_name = augmentation_name
self.policies = self.available_policies[augmentation_name]
self.cutout_const = float(cutout_const)
self.translate_const = float(translate_const)
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Applies the AutoAugment policy to `image`.
AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
      A version of image that now has data augmentation applied to it based on
      the `policies` passed into the function.
"""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
replace_value = [128] * 3
# func is the string name of the augmentation function, prob is the
# probability of applying the operation and level is the parameter
# associated with the tf op.
# tf_policies are functions that take in an image and return an augmented
# image.
tf_policies = []
for policy in self.policies:
tf_policy = []
# Link string name to the correct python function and make sure the
# correct argument is passed into that function.
for policy_info in policy:
policy_info = list(policy_info) + [
replace_value, self.cutout_const, self.translate_const
]
tf_policy.append(_parse_policy_info(*policy_info))
      # Now build the tf policy that will apply the augmentation procedure
      # on image.
def make_final_policy(tf_policy_):
def final_policy(image_):
for func, prob, args in tf_policy_:
image_ = _apply_func_with_prob(func, image_, args, prob)
return image_
return final_policy
tf_policies.append(make_final_policy(tf_policy))
image = select_and_apply_random_policy(tf_policies, image)
image = tf.cast(image, dtype=input_image_type)
return image
@staticmethod
def policy_v0():
"""Autoaugment policy that was used in AutoAugment Paper.
Each tuple is an augmentation operation of the form
(operation, probability, magnitude). Each element in policy is a
sub-policy that will be applied sequentially on the image.
Returns:
the policy.
"""
policy = [
[('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
[('Color', 0.4, 9), ('Equalize', 0.6, 3)],
[('Color', 0.4, 1), ('Rotate', 0.6, 8)],
[('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
[('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
[('Color', 0.2, 0), ('Equalize', 0.8, 8)],
[('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
[('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
[('Color', 0.6, 1), ('Equalize', 1.0, 2)],
[('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
[('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
[('Color', 0.4, 7), ('Equalize', 0.6, 0)],
[('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
[('Solarize', 0.6, 8), ('Color', 0.6, 9)],
[('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
[('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
[('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
[('ShearY', 0.8, 0), ('Color', 0.6, 4)],
[('Color', 1.0, 0), ('Rotate', 0.6, 2)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
[('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
[('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
[('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
[('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
[('Color', 0.8, 6), ('Rotate', 0.4, 5)],
]
return policy
@staticmethod
def policy_simple():
"""Same as `policy_v0`, except with custom ops removed."""
policy = [
[('Color', 0.4, 9), ('Equalize', 0.6, 3)],
[('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
[('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
[('Color', 0.2, 0), ('Equalize', 0.8, 8)],
[('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
[('Color', 0.6, 1), ('Equalize', 1.0, 2)],
[('Color', 0.4, 7), ('Equalize', 0.6, 0)],
[('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
[('Solarize', 0.6, 8), ('Color', 0.6, 9)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
[('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
[('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
[('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
]
return policy
@staticmethod
def policy_test():
"""Autoaugment test policy for debugging."""
policy = [
[('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
]
return policy
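# A minimal illustrative sketch of passing a custom `policies` dict, assuming
# the (func, prob, level) tuple convention documented in __init__; the policy
# contents and name here are arbitrary placeholders, and the helper name is
# hypothetical.
def _example_custom_autoaugment() -> AutoAugment:
  """Builds an AutoAugment instance from a single-sub-policy custom dict."""
  custom = {'my_policy': [[('Equalize', 0.8, 1), ('ShearY', 0.8, 4)]]}
  return AutoAugment(augmentation_name='my_policy', policies=custom)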
class RandAugment(ImageAugment):
"""Applies the RandAugment policy to images.
  RandAugment is from the paper https://arxiv.org/abs/1909.13719.
"""
def __init__(self,
num_layers: int = 2,
magnitude: float = 10.,
cutout_const: float = 40.,
translate_const: float = 100.):
"""Applies the RandAugment policy to images.
Args:
num_layers: Integer, the number of augmentation transformations to apply
sequentially to an image. Represented as (N) in the paper. Usually best
values will be in the range [1, 3].
      magnitude: Float, shared magnitude across all augmentation operations.
Represented as (M) in the paper. Usually best values are in the range
[5, 10].
cutout_const: multiplier for applying cutout.
translate_const: multiplier for applying translation.
"""
super(RandAugment, self).__init__()
self.num_layers = num_layers
self.magnitude = float(magnitude)
self.cutout_const = float(cutout_const)
self.translate_const = float(translate_const)
self.available_ops = [
'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize',
'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX', 'ShearY',
'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd'
]
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Applies the RandAugment policy to `image`.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
The augmented version of `image`.
"""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
replace_value = [128] * 3
min_prob, max_prob = 0.2, 0.8
for _ in range(self.num_layers):
op_to_select = tf.random.uniform([],
maxval=len(self.available_ops) + 1,
dtype=tf.int32)
branch_fns = []
for (i, op_name) in enumerate(self.available_ops):
prob = tf.random.uniform([],
minval=min_prob,
maxval=max_prob,
dtype=tf.float32)
func, _, args = _parse_policy_info(op_name, prob, self.magnitude,
replace_value, self.cutout_const,
self.translate_const)
branch_fns.append((
i,
# pylint:disable=g-long-lambda
lambda selected_func=func, selected_args=args: selected_func(
image, *selected_args)))
# pylint:enable=g-long-lambda
image = tf.switch_case(
branch_index=op_to_select,
branch_fns=branch_fns,
default=lambda: tf.identity(image))
image = tf.cast(image, dtype=input_image_type)
return image
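# A minimal illustrative usage of RandAugment, assuming a uint8 HxWx3 image;
# num_layers and magnitude follow the defaults suggested in the docstring.
# The helper name is hypothetical.
def _example_randaugment(image: tf.Tensor) -> tf.Tensor:
  """Distorts `image` with two RandAugment layers at magnitude 10."""
  augmenter = RandAugment(num_layers=2, magnitude=10.)
  return augmenter.distort(image)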
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for autoaugment."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from official.legacy.image_classification import augment
def get_dtype_test_cases():
return [
('uint8', tf.uint8),
('int32', tf.int32),
('float16', tf.float16),
('float32', tf.float32),
]
@parameterized.named_parameters(get_dtype_test_cases())
class TransformsTest(parameterized.TestCase, tf.test.TestCase):
"""Basic tests for fundamental transformations."""
def test_to_from_4d(self, dtype):
for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]:
original_ndims = len(shape)
image = tf.zeros(shape, dtype=dtype)
image_4d = augment.to_4d(image)
self.assertEqual(4, tf.rank(image_4d))
self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims))
def test_transform(self, dtype):
image = tf.constant([[1, 2], [3, 4]], dtype=dtype)
self.assertAllEqual(
augment.transform(image, transforms=[1] * 8), [[4, 4], [4, 4]])
def test_translate(self, dtype):
image = tf.constant(
[[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], dtype=dtype)
translations = [-1, -1]
translated = augment.translate(image=image, translations=translations)
expected = [[1, 0, 1, 1], [0, 1, 0, 0], [1, 0, 1, 1], [1, 0, 1, 1]]
self.assertAllEqual(translated, expected)
def test_translate_shapes(self, dtype):
translation = [0, 0]
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.translate(image, translation))
def test_translate_invalid_translation(self, dtype):
image = tf.zeros((1, 1), dtype=dtype)
invalid_translation = [[[1, 1]]]
with self.assertRaisesRegex(TypeError, 'rank 1 or 2'):
_ = augment.translate(image, invalid_translation)
def test_rotate(self, dtype):
image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3))
rotation = 90.
transformed = augment.rotate(image=image, degrees=rotation)
expected = [[2, 5, 8], [1, 4, 7], [0, 3, 6]]
self.assertAllEqual(transformed, expected)
def test_rotate_shapes(self, dtype):
degrees = 0.
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.rotate(image, degrees))
class AutoaugmentTest(tf.test.TestCase):
def test_autoaugment(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.AutoAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_randaug(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.RandAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_all_policy_ops(self):
"""Smoke test to be sure all augmentation functions can execute."""
prob = 1
magnitude = 10
replace_value = [128] * 3
cutout_const = 100
translate_const = 250
image = tf.ones((224, 224, 3), dtype=tf.uint8)
for op_name in augment.NAME_TO_FUNC:
func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
replace_value, cutout_const,
translate_const)
image = func(image, *args)
self.assertEqual((224, 224, 3), image.shape)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common modules for callbacks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from typing import Any, List, MutableMapping, Optional, Text
from absl import logging
import tensorflow as tf
from official.modeling import optimization
from official.utils.misc import keras_utils
def get_callbacks(
model_checkpoint: bool = True,
include_tensorboard: bool = True,
time_history: bool = True,
track_lr: bool = True,
write_model_weights: bool = True,
apply_moving_average: bool = False,
initial_step: int = 0,
batch_size: int = 0,
log_steps: int = 0,
model_dir: Optional[str] = None,
backup_and_restore: bool = False) -> List[tf.keras.callbacks.Callback]:
"""Get all callbacks."""
model_dir = model_dir or ''
callbacks = []
if model_checkpoint:
ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(
tf.keras.callbacks.ModelCheckpoint(
ckpt_full_path, save_weights_only=True, verbose=1))
if backup_and_restore:
backup_dir = os.path.join(model_dir, 'tmp')
callbacks.append(
tf.keras.callbacks.experimental.BackupAndRestore(backup_dir))
if include_tensorboard:
callbacks.append(
CustomTensorBoard(
log_dir=model_dir,
track_lr=track_lr,
initial_step=initial_step,
write_images=write_model_weights,
profile_batch=0))
if time_history:
callbacks.append(
keras_utils.TimeHistory(
batch_size,
log_steps,
logdir=model_dir if include_tensorboard else None))
if apply_moving_average:
# Save moving average model to a different file so that
# we can resume training from a checkpoint
ckpt_full_path = os.path.join(model_dir, 'average',
'model.ckpt-{epoch:04d}')
callbacks.append(
AverageModelCheckpoint(
update_weights=False,
filepath=ckpt_full_path,
save_weights_only=True,
verbose=1))
callbacks.append(MovingAverageCallback())
return callbacks
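# An illustrative call to get_callbacks(), assuming a Keras training run that
# checkpoints and logs to TensorBoard under `model_dir`; the values shown are
# placeholders rather than recommended settings, and the helper name is
# hypothetical.
def _example_callbacks(model_dir: str) -> List[tf.keras.callbacks.Callback]:
  """Builds the standard callback list for a toy training run."""
  return get_callbacks(
      model_checkpoint=True,
      include_tensorboard=True,
      time_history=True,
      track_lr=False,
      write_model_weights=False,
      batch_size=32,
      log_steps=100,
      model_dir=model_dir)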
def get_scalar_from_tensor(t: tf.Tensor) -> int:
"""Utility function to convert a Tensor to a scalar."""
t = tf.keras.backend.get_value(t)
if callable(t):
return t()
else:
return t
class CustomTensorBoard(tf.keras.callbacks.TensorBoard):
"""A customized TensorBoard callback that tracks additional datapoints.
Metrics tracked:
- Global learning rate
Attributes:
log_dir: the path of the directory where to save the log files to be parsed
by TensorBoard.
track_lr: `bool`, whether or not to track the global learning rate.
initial_step: the initial step, used for preemption recovery.
**kwargs: Additional arguments for backwards compatibility. Possible key is
`period`.
"""
# TODO(b/146499062): track params, flops, log lr, l2 loss,
# classification loss
def __init__(self,
log_dir: str,
track_lr: bool = False,
initial_step: int = 0,
**kwargs):
super(CustomTensorBoard, self).__init__(log_dir=log_dir, **kwargs)
self.step = initial_step
self._track_lr = track_lr
def on_batch_begin(self,
epoch: int,
logs: Optional[MutableMapping[str, Any]] = None) -> None:
self.step += 1
if logs is None:
logs = {}
logs.update(self._calculate_metrics())
super(CustomTensorBoard, self).on_batch_begin(epoch, logs)
def on_epoch_begin(self,
epoch: int,
logs: Optional[MutableMapping[str, Any]] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
for k, v in metrics.items():
logging.info('Current %s: %f', k, v)
super(CustomTensorBoard, self).on_epoch_begin(epoch, logs)
def on_epoch_end(self,
epoch: int,
logs: Optional[MutableMapping[str, Any]] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
super(CustomTensorBoard, self).on_epoch_end(epoch, logs)
def _calculate_metrics(self) -> MutableMapping[str, Any]:
logs = {}
# TODO(b/149030439): disable LR reporting.
# if self._track_lr:
# logs['learning_rate'] = self._calculate_lr()
return logs
def _calculate_lr(self) -> int:
"""Calculates the learning rate given the current step."""
return get_scalar_from_tensor(
self._get_base_optimizer()._decayed_lr(var_dtype=tf.float32)) # pylint:disable=protected-access
def _get_base_optimizer(self) -> tf.keras.optimizers.Optimizer:
"""Get the base optimizer used by the current model."""
optimizer = self.model.optimizer
# The optimizer might be wrapped by another class, so unwrap it
while hasattr(optimizer, '_optimizer'):
optimizer = optimizer._optimizer # pylint:disable=protected-access
return optimizer
class MovingAverageCallback(tf.keras.callbacks.Callback):
"""A Callback to be used with a `ExponentialMovingAverage` optimizer.
Applies moving average weights to the model during validation time to test
and predict on the averaged weights rather than the current model weights.
  Once training is complete, the model weights can be overwritten with the
  averaged weights by setting `overwrite_weights_on_train_end`.
Attributes:
overwrite_weights_on_train_end: Whether to overwrite the current model
weights with the averaged weights from the moving average optimizer.
**kwargs: Any additional callback arguments.
"""
def __init__(self, overwrite_weights_on_train_end: bool = False, **kwargs):
super(MovingAverageCallback, self).__init__(**kwargs)
self.overwrite_weights_on_train_end = overwrite_weights_on_train_end
def set_model(self, model: tf.keras.Model):
super(MovingAverageCallback, self).set_model(model)
assert isinstance(self.model.optimizer,
optimization.ExponentialMovingAverage)
self.model.optimizer.shadow_copy(self.model)
def on_test_begin(self, logs: Optional[MutableMapping[Text, Any]] = None):
self.model.optimizer.swap_weights()
def on_test_end(self, logs: Optional[MutableMapping[Text, Any]] = None):
self.model.optimizer.swap_weights()
def on_train_end(self, logs: Optional[MutableMapping[Text, Any]] = None):
if self.overwrite_weights_on_train_end:
self.model.optimizer.assign_average_vars(self.model.variables)
class AverageModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
"""Saves and, optionally, assigns the averaged weights.
Taken from tfa.callbacks.AverageModelCheckpoint.
Attributes:
update_weights: If True, assign the moving average weights to the model, and
save them. If False, keep the old non-averaged weights, but the saved
model uses the average weights. See `tf.keras.callbacks.ModelCheckpoint`
for the other args.
"""
def __init__(self,
update_weights: bool,
filepath: str,
monitor: str = 'val_loss',
verbose: int = 0,
save_best_only: bool = False,
save_weights_only: bool = False,
mode: str = 'auto',
save_freq: str = 'epoch',
**kwargs):
self.update_weights = update_weights
super().__init__(filepath, monitor, verbose, save_best_only,
save_weights_only, mode, save_freq, **kwargs)
def set_model(self, model):
if not isinstance(model.optimizer, optimization.ExponentialMovingAverage):
      raise TypeError('AverageModelCheckpoint is only used when training '
                      'with MovingAverage')
return super().set_model(model)
def _save_model(self, epoch, logs):
assert isinstance(self.model.optimizer,
optimization.ExponentialMovingAverage)
if self.update_weights:
self.model.optimizer.assign_average_vars(self.model.variables)
return super()._save_model(epoch, logs) # pytype: disable=attribute-error # typed-keras
else:
# Note: `model.get_weights()` gives us the weights (non-ref)
# whereas `model.variables` returns references to the variables.
non_avg_weights = self.model.get_weights()
self.model.optimizer.assign_average_vars(self.model.variables)
# result is currently None, since `super._save_model` doesn't
# return anything, but this may change in the future.
result = super()._save_model(epoch, logs) # pytype: disable=attribute-error # typed-keras
self.model.set_weights(non_avg_weights)
return result
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs an Image Classification model."""
import os
import pprint
from typing import Any, Mapping, Optional, Text, Tuple
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from official.common import distribute_utils
from official.legacy.image_classification import callbacks as custom_callbacks
from official.legacy.image_classification import dataset_factory
from official.legacy.image_classification import optimizer_factory
from official.legacy.image_classification.configs import base_configs
from official.legacy.image_classification.configs import configs
from official.legacy.image_classification.efficientnet import efficientnet_model
from official.legacy.image_classification.resnet import common
from official.legacy.image_classification.resnet import resnet_model
from official.legacy.image_classification.vgg import vgg_model
from official.modeling import hyperparams
from official.modeling import performance
from official.utils import hyperparams_flags
from official.utils.misc import keras_utils
def get_models() -> Mapping[str, tf.keras.Model]:
"""Returns the mapping from model type name to Keras model."""
return {
'efficientnet': efficientnet_model.EfficientNet.from_name,
'resnet': resnet_model.resnet50,
'vgg': vgg_model.vgg16,
}
def get_dtype_map() -> Mapping[str, tf.dtypes.DType]:
"""Returns the mapping from dtype string representations to TF dtypes."""
return {
'float32': tf.float32,
'bfloat16': tf.bfloat16,
'float16': tf.float16,
'fp32': tf.float32,
'bf16': tf.bfloat16,
}
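# A tiny illustrative lookup against the two mappings above; 'resnet' and
# 'fp32' are keys defined in this module, and the helper name is hypothetical.
def _example_resolve_model_and_dtype():
  """Returns the ResNet-50 constructor and the TF dtype for 'fp32'."""
  return get_models()['resnet'], get_dtype_map()['fp32']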
def _get_metrics(one_hot: bool) -> Mapping[Text, Any]:
"""Get a dict of available metrics to track."""
if one_hot:
return {
# (name, metric_fn)
'acc':
tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'accuracy':
tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_1':
tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_5':
tf.keras.metrics.TopKCategoricalAccuracy(
k=5, name='top_5_accuracy'),
}
else:
return {
# (name, metric_fn)
'acc':
tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'accuracy':
tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_1':
tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_5':
tf.keras.metrics.SparseTopKCategoricalAccuracy(
k=5, name='top_5_accuracy'),
}
def get_image_size_from_model(
params: base_configs.ExperimentConfig) -> Optional[int]:
"""If the given model has a preferred image size, return it."""
if params.model_name == 'efficientnet':
efficientnet_name = params.model.model_params.model_name
if efficientnet_name in efficientnet_model.MODEL_CONFIGS:
return efficientnet_model.MODEL_CONFIGS[efficientnet_name].resolution
return None
def _get_dataset_builders(params: base_configs.ExperimentConfig,
strategy: tf.distribute.Strategy,
one_hot: bool) -> Tuple[Any, Any]:
"""Create and return train and validation dataset builders."""
if one_hot:
logging.warning('label_smoothing > 0, so datasets will be one hot encoded.')
else:
logging.warning('label_smoothing not applied, so datasets will not be one '
'hot encoded.')
num_devices = strategy.num_replicas_in_sync if strategy else 1
image_size = get_image_size_from_model(params)
dataset_configs = [params.train_dataset, params.validation_dataset]
builders = []
for config in dataset_configs:
if config is not None and config.has_data:
builder = dataset_factory.DatasetBuilder(
config,
image_size=image_size or config.image_size,
num_devices=num_devices,
one_hot=one_hot)
else:
builder = None
builders.append(builder)
return builders
def get_loss_scale(params: base_configs.ExperimentConfig,
fp16_default: float = 128.) -> float:
"""Returns the loss scale for initializations."""
loss_scale = params.runtime.loss_scale
if loss_scale == 'dynamic':
return loss_scale
elif loss_scale is not None:
return float(loss_scale)
elif (params.train_dataset.dtype == 'float32' or
params.train_dataset.dtype == 'bfloat16'):
return 1.
else:
assert params.train_dataset.dtype == 'float16'
return fp16_default
def _get_params_from_flags(flags_obj: flags.FlagValues):
"""Get ParamsDict from flags."""
model = flags_obj.model_type.lower()
dataset = flags_obj.dataset.lower()
params = configs.get_config(model=model, dataset=dataset)
flags_overrides = {
'model_dir': flags_obj.model_dir,
'mode': flags_obj.mode,
'model': {
'name': model,
},
'runtime': {
'run_eagerly': flags_obj.run_eagerly,
'tpu': flags_obj.tpu,
},
'train_dataset': {
'data_dir': flags_obj.data_dir,
},
'validation_dataset': {
'data_dir': flags_obj.data_dir,
},
'train': {
'time_history': {
'log_steps': flags_obj.log_steps,
},
},
}
overriding_configs = (flags_obj.config_file, flags_obj.params_override,
flags_overrides)
pp = pprint.PrettyPrinter()
logging.info('Base params: %s', pp.pformat(params.as_dict()))
for param in overriding_configs:
logging.info('Overriding params: %s', param)
params = hyperparams.override_params_dict(params, param, is_strict=True)
params.validate()
params.lock()
logging.info('Final model parameters: %s', pp.pformat(params.as_dict()))
return params
def resume_from_checkpoint(model: tf.keras.Model, model_dir: str,
train_steps: int) -> int:
"""Resumes from the latest checkpoint, if possible.
Loads the model weights and optimizer settings from a checkpoint.
This function should be used in case of preemption recovery.
Args:
model: The model whose weights should be restored.
model_dir: The directory where model weights were saved.
train_steps: The number of steps to train.
Returns:
The epoch of the latest checkpoint, or 0 if not restoring.
"""
logging.info('Load from checkpoint is enabled.')
latest_checkpoint = tf.train.latest_checkpoint(model_dir)
logging.info('latest_checkpoint: %s', latest_checkpoint)
if not latest_checkpoint:
logging.info('No checkpoint detected.')
return 0
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint)
model.load_weights(latest_checkpoint)
initial_epoch = model.optimizer.iterations // train_steps
logging.info('Completed loading from checkpoint.')
logging.info('Resuming from epoch %d', initial_epoch)
return int(initial_epoch)
def initialize(params: base_configs.ExperimentConfig,
dataset_builder: dataset_factory.DatasetBuilder):
"""Initializes backend related initializations."""
keras_utils.set_session_config(enable_xla=params.runtime.enable_xla)
performance.set_mixed_precision_policy(dataset_builder.dtype)
if tf.config.list_physical_devices('GPU'):
data_format = 'channels_first'
else:
data_format = 'channels_last'
tf.keras.backend.set_image_data_format(data_format)
if params.runtime.run_eagerly:
# Enable eager execution to allow step-by-step debugging
tf.config.experimental_run_functions_eagerly(True)
if tf.config.list_physical_devices('GPU'):
if params.runtime.gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=params.runtime.per_gpu_thread_count,
gpu_thread_mode=params.runtime.gpu_thread_mode,
num_gpus=params.runtime.num_gpus,
datasets_num_private_threads=params.runtime
.dataset_num_private_threads) # pylint:disable=line-too-long
if params.runtime.batchnorm_spatial_persistent:
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
def define_classifier_flags():
"""Defines common flags for image classification."""
hyperparams_flags.initialize_common_flags()
flags.DEFINE_string(
'data_dir', default=None, help='The location of the input data.')
flags.DEFINE_string(
'mode',
default=None,
help='Mode to run: `train`, `eval`, `train_and_eval` or `export`.')
flags.DEFINE_bool(
'run_eagerly',
default=None,
help='Use eager execution and disable autograph for debugging.')
flags.DEFINE_string(
'model_type',
default=None,
help='The type of the model, e.g. EfficientNet, etc.')
flags.DEFINE_string(
'dataset',
default=None,
help='The name of the dataset, e.g. ImageNet, etc.')
flags.DEFINE_integer(
'log_steps',
default=100,
help='The interval of steps between logging of batch level stats.')
def serialize_config(params: base_configs.ExperimentConfig, model_dir: str):
"""Serializes and saves the experiment config."""
params_save_path = os.path.join(model_dir, 'params.yaml')
logging.info('Saving experiment configuration to %s', params_save_path)
tf.io.gfile.makedirs(model_dir)
hyperparams.save_params_dict_to_yaml(params, params_save_path)
def train_and_eval(
params: base_configs.ExperimentConfig,
strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]:
"""Runs the train and eval path using compile/fit."""
logging.info('Running train and eval.')
distribute_utils.configure_cluster(params.runtime.worker_hosts,
params.runtime.task_index)
# Note: for TPUs, strategy and scope should be created before the dataset
strategy = strategy_override or distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
strategy_scope = distribute_utils.get_strategy_scope(strategy)
logging.info('Detected %d devices.',
strategy.num_replicas_in_sync if strategy else 1)
label_smoothing = params.model.loss.label_smoothing
one_hot = label_smoothing and label_smoothing > 0
builders = _get_dataset_builders(params, strategy, one_hot)
datasets = [
builder.build(strategy) if builder else None for builder in builders
]
# Unpack datasets and builders based on train/val/test splits
train_builder, validation_builder = builders # pylint: disable=unbalanced-tuple-unpacking
train_dataset, validation_dataset = datasets
train_epochs = params.train.epochs
train_steps = params.train.steps or train_builder.num_steps
validation_steps = params.evaluation.steps or validation_builder.num_steps
initialize(params, train_builder)
logging.info('Global batch size: %d', train_builder.global_batch_size)
with strategy_scope:
model_params = params.model.model_params.as_dict()
model = get_models()[params.model.name](**model_params)
learning_rate = optimizer_factory.build_learning_rate(
params=params.model.learning_rate,
batch_size=train_builder.global_batch_size,
train_epochs=train_epochs,
train_steps=train_steps)
optimizer = optimizer_factory.build_optimizer(
optimizer_name=params.model.optimizer.name,
base_learning_rate=learning_rate,
params=params.model.optimizer.as_dict(),
model=model)
optimizer = performance.configure_optimizer(
optimizer,
use_float16=train_builder.dtype == 'float16',
loss_scale=get_loss_scale(params))
metrics_map = _get_metrics(one_hot)
metrics = [metrics_map[metric] for metric in params.train.metrics]
steps_per_loop = train_steps if params.train.set_epoch_loop else 1
if one_hot:
loss_obj = tf.keras.losses.CategoricalCrossentropy(
label_smoothing=params.model.loss.label_smoothing)
else:
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(
optimizer=optimizer,
loss=loss_obj,
metrics=metrics,
steps_per_execution=steps_per_loop)
initial_epoch = 0
if params.train.resume_checkpoint:
initial_epoch = resume_from_checkpoint(
model=model, model_dir=params.model_dir, train_steps=train_steps)
callbacks = custom_callbacks.get_callbacks(
model_checkpoint=params.train.callbacks.enable_checkpoint_and_export,
include_tensorboard=params.train.callbacks.enable_tensorboard,
time_history=params.train.callbacks.enable_time_history,
track_lr=params.train.tensorboard.track_lr,
write_model_weights=params.train.tensorboard.write_model_weights,
initial_step=initial_epoch * train_steps,
batch_size=train_builder.global_batch_size,
log_steps=params.train.time_history.log_steps,
model_dir=params.model_dir,
backup_and_restore=params.train.callbacks.enable_backup_and_restore)
serialize_config(params=params, model_dir=params.model_dir)
if params.evaluation.skip_eval:
validation_kwargs = {}
else:
validation_kwargs = {
'validation_data': validation_dataset,
'validation_steps': validation_steps,
'validation_freq': params.evaluation.epochs_between_evals,
}
history = model.fit(
train_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
initial_epoch=initial_epoch,
callbacks=callbacks,
verbose=2,
**validation_kwargs)
validation_output = None
if not params.evaluation.skip_eval:
validation_output = model.evaluate(
validation_dataset, steps=validation_steps, verbose=2)
# TODO(dankondratyuk): eval and save final test accuracy
stats = common.build_stats(history, validation_output, callbacks)
return stats
def export(params: base_configs.ExperimentConfig):
"""Runs the model export functionality."""
logging.info('Exporting model.')
model_params = params.model.model_params.as_dict()
model = get_models()[params.model.name](**model_params)
checkpoint = params.export.checkpoint
if checkpoint is None:
logging.info('No export checkpoint was provided. Using the latest '
'checkpoint from model_dir.')
checkpoint = tf.train.latest_checkpoint(params.model_dir)
model.load_weights(checkpoint)
model.save(params.export.destination)
def run(flags_obj: flags.FlagValues,
strategy_override: tf.distribute.Strategy = None) -> Mapping[str, Any]:
"""Runs Image Classification model using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
strategy_override: A `tf.distribute.Strategy` object to use for model.
Returns:
Dictionary of training/eval stats
"""
params = _get_params_from_flags(flags_obj)
if params.mode == 'train_and_eval':
return train_and_eval(params, strategy_override)
elif params.mode == 'export_only':
export(params)
else:
raise ValueError('{} is not a valid mode.'.format(params.mode))
def main(_):
stats = run(flags.FLAGS)
if stats:
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_classifier_flags()
flags.mark_flag_as_required('data_dir')
flags.mark_flag_as_required('mode')
flags.mark_flag_as_required('model_type')
flags.mark_flag_as_required('dataset')
app.run(main)