ModelZoo / ResNet50_tensorflow · Commit 2412b118
"docs/vscode:/vscode.git/clone" did not exist on "c81dddb45c71e630b907f9d84686ecd73b4105c7"
Commit 2412b118, authored Jul 02, 2022 by Gunho Park

Merge branch 'master' of https://github.com/tensorflow/models

Parents: f7783e7a, 6dbdb08c
The commit touches 33 files in total; this page shows 13 changed files with 240 additions and 65 deletions (+240 −65).
official/vision/beta/projects/panoptic_maskrcnn/configs/panoptic_deeplab_test.py  +45 −0
official/vision/configs/experiments/image_classification/imagenet_mobilenetv3large_tpu.yaml  +5 −5
official/vision/configs/experiments/image_classification/imagenet_mobilenetv3small_tpu.yaml  +4 −4
official/vision/configs/retinanet.py  +1 −0
official/vision/dataloaders/classification_input.py  +5 −0
official/vision/modeling/factory.py  +1 −0
official/vision/modeling/heads/dense_prediction_heads.py  +9 −1
official/vision/modeling/heads/dense_prediction_heads_test.py  +8 −6
official/vision/modeling/maskrcnn_model.py  +1 −1
official/vision/ops/preprocess_ops.py  +47 −0
official/vision/ops/preprocess_ops_3d.py  +95 −48
official/vision/ops/preprocess_ops_3d_test.py  +10 −0
research/object_detection/meta_architectures/center_net_meta_arch.py  +9 −0
official/vision/beta/projects/panoptic_maskrcnn/configs/panoptic_deeplab_test.py (new file, mode 100644, +45 −0)
```python
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for panoptic deeplab config."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf

from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab as exp_cfg


class PanopticMaskRCNNConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      ('panoptic_deeplab_resnet_coco', 'dilated_resnet'),
      ('panoptic_deeplab_mobilenetv3_large_coco', 'mobilenet'),
  )
  def test_panoptic_deeplab_configs(self, config_name, backbone_type):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.PanopticDeeplabTask)
    self.assertIsInstance(config.task.model, exp_cfg.PanopticDeeplab)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    self.assertEqual(config.task.model.backbone.type, backbone_type)
    config.validate()

    config.task.train_data.is_training = None
    with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
```
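For context on what the test drives: experiment names resolve through the registry in `official.core.exp_factory`. A minimal sketch, assuming (as the test does) that importing the panoptic_deeplab configs module registers the experiments at import time:

```python
# Minimal sketch: resolve a registered experiment config and validate it.
# The import is what registers 'panoptic_deeplab_resnet_coco'; this mirrors
# the test above rather than adding new behavior.
from official.core import exp_factory
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab  # pylint: disable=unused-import

config = exp_factory.get_exp_config('panoptic_deeplab_resnet_coco')
print(config.task.model.backbone.type)  # expected: 'dilated_resnet'
config.validate()  # raises KeyError if restrictions are inconsistent
```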
official/vision/configs/experiments/image_classification/imagenet_mobilenetv3large_tpu.yaml (+5 −5)
```diff
-# MobileNetV3-large_1.0 ImageNet classification: ~75.3% top-1.
+# MobileNetV3-large_1.0 ImageNet classification: ~75.7% top-1.
 runtime:
   distribution_strategy: 'tpu'
   mixed_precision_dtype: 'bfloat16'
@@ -27,10 +27,10 @@ task:
     dtype: 'bfloat16'
     aug_type:
       autoaug:
-        augmentation_name: v0
+        augmentation_name: 'v0'
         cutout_const: 100
         translate_const: 250
-      type: autoaug
+      type: 'autoaug'
   validation_data:
     input_path: 'imagenet-2012-tfrecord/valid*'
     is_training: false
@@ -38,7 +38,7 @@ task:
     dtype: 'bfloat16'
     drop_remainder: false
 trainer:
-  train_steps: 156000  # 500 epochs
+  train_steps: 218000  # 700 epochs
   validation_steps: 13
   validation_interval: 312
   steps_per_loop: 312  # NUM_EXAMPLES (1281167) // global_batch_size
@@ -48,7 +48,7 @@ trainer:
   learning_rate:
     cosine:
       alpha: 0.0
-      decay_steps: 156000
+      decay_steps: 218000
       initial_learning_rate: 0.004
       name: CosineDecay
       offset: 0
```
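The step counts are consistent with the `steps_per_loop` comment: at 312 steps per epoch, the old 156000 steps is 500 epochs and the new 218000 is roughly 700. A quick check, assuming the global batch size of 4096 implied by 1281167 // batch_size = 312 (the batch size itself is not stated in this hunk):

```python
# Sanity check of the epoch arithmetic in the hunk above.
num_examples = 1281167
global_batch_size = 4096  # assumption inferred from the steps_per_loop comment
steps_per_epoch = num_examples // global_batch_size
assert steps_per_epoch == 312
print(156000 / steps_per_epoch)  # 500.0 epochs (old value)
print(218000 / steps_per_epoch)  # ~698.7, i.e. ~700 epochs (new value)
```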
official/vision/configs/experiments/image_classification/imagenet_mobilenetv3small_tpu.yaml (+4 −4)
```diff
-# MobileNetV3Small ImageNet classification. 67.5% top-1 and 87.6% top-5 accuracy.
+# MobileNetV3Small ImageNet classification. 67.5% top-1 and 87.7% top-5 accuracy.
 runtime:
   distribution_strategy: 'tpu'
   mixed_precision_dtype: 'bfloat16'
@@ -34,7 +34,7 @@ task:
     drop_remainder: false
 trainer:
   train_steps: 312000  # 1000 epochs
-  validation_steps: 12
+  validation_steps: 13
   validation_interval: 312
   steps_per_loop: 312  # NUM_EXAMPLES (1281167) // global_batch_size
   summary_interval: 312
@@ -49,7 +49,7 @@ trainer:
   learning_rate:
     type: 'exponential'
     exponential:
-      initial_learning_rate: 0.01
+      initial_learning_rate: 0.426  # 0.02 * (batch_size / 192)
      decay_steps: 936  # 3 * steps_per_epoch
      decay_rate: 0.99
      staircase: true
@@ -60,4 +60,4 @@ trainer:
     type: 'linear'
     linear:
       warmup_steps: 1560
-      warmup_learning_rate: 0.001
+      warmup_learning_rate: 0.0
```
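The new initial learning rate follows the scaling rule in its own comment; with the same implied global batch size of 4096:

```python
# 0.02 * (batch_size / 192) with batch_size = 4096 (assumption, inferred from
# the 312 steps/epoch comment as above).
print(0.02 * (4096 / 192))  # 0.42666..., stored as 0.426 in the config
```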
official/vision/configs/retinanet.py (+1 −0)
```diff
@@ -107,6 +107,7 @@ class RetinaNetHead(hyperparams.Config):
   num_filters: int = 256
   use_separable_conv: bool = False
   attribute_heads: List[AttributeHead] = dataclasses.field(default_factory=list)
+  share_classification_heads: bool = False


 @dataclasses.dataclass
```
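Since the new field is a plain dataclass attribute on the `RetinaNetHead` config, callers opt in the usual way; a small illustrative sketch (only the field name and its `False` default come from the diff, the construction call is an assumption):

```python
# Illustrative: the flag defaults to False and is set like any other field.
from official.vision.configs import retinanet as retinanet_cfg

head_cfg = retinanet_cfg.RetinaNetHead(share_classification_heads=True)
assert retinanet_cfg.RetinaNetHead().share_classification_heads is False
```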
official/vision/dataloaders/classification_input.py (+5 −0)
```diff
@@ -254,6 +254,11 @@ class Parser(parser.Parser):
     return image

+  def parse_train_image(self, decoded_tensors: Dict[str,
+                                                    tf.Tensor]) -> tf.Tensor:
+    """Public interface for parsing image data for training."""
+    return self._parse_train_image(decoded_tensors)
+
   @classmethod
   def inference_fn(cls,
                    image: tf.Tensor,
```
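The added method only re-exports the private `_parse_train_image`, so external code can run the training-image pipeline on already-decoded tensors. A hedged usage sketch; the `Parser` constructor arguments and the `'image/encoded'` key are assumptions, not part of this diff:

```python
# Hypothetical usage of the new public method (constructor args assumed).
import tensorflow as tf
from official.vision.dataloaders import classification_input

parser = classification_input.Parser(output_size=[224, 224], num_classes=1001)
decoded = {'image/encoded': tf.io.read_file('/tmp/example.jpg')}
train_image = parser.parse_train_image(decoded)  # augmented training image
```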
official/vision/modeling/factory.py (+1 −0)
```diff
@@ -293,6 +293,7 @@ def build_retinanet(
       attribute_heads=[
           cfg.as_dict() for cfg in (head_config.attribute_heads or [])
       ],
+      share_classification_heads=head_config.share_classification_heads,
       use_separable_conv=head_config.use_separable_conv,
       activation=norm_activation_config.activation,
       use_sync_bn=norm_activation_config.use_sync_bn,
```
official/vision/modeling/heads/dense_prediction_heads.py (+9 −1)
```diff
@@ -37,6 +37,7 @@ class RetinaNetHead(tf.keras.layers.Layer):
       num_convs: int = 4,
       num_filters: int = 256,
       attribute_heads: Optional[List[Dict[str, Any]]] = None,
+      share_classification_heads: bool = False,
       use_separable_conv: bool = False,
       activation: str = 'relu',
       use_sync_bn: bool = False,
@@ -62,6 +63,8 @@ class RetinaNetHead(tf.keras.layers.Layer):
         additional attribute head. Each dict consists of 3 key-value pairs:
         `name`, `type` ('regression' or 'classification'), and `size` (number
         of predicted values for each instance).
+      share_classification_heads: A `bool` that indicates whether to share
+        weights among the main and attribute classification heads.
       use_separable_conv: A `bool` that indicates whether separable
         convolution layers are used.
       activation: A `str` that indicates which activation is used, e.g. 'relu',
@@ -88,6 +91,7 @@ class RetinaNetHead(tf.keras.layers.Layer):
       'num_convs': num_convs,
       'num_filters': num_filters,
       'attribute_heads': attribute_heads,
+      'share_classification_heads': share_classification_heads,
       'use_separable_conv': use_separable_conv,
       'activation': activation,
       'use_sync_bn': use_sync_bn,
@@ -216,7 +220,11 @@ class RetinaNetHead(tf.keras.layers.Layer):
         this_level_att_norms = []
         for i in range(self._config_dict['num_convs']):
           if level == self._config_dict['min_level']:
-            att_conv_name = '{}-conv_{}'.format(att_name, i)
+            if (self._config_dict['share_classification_heads'] and
+                att_type == 'classification'):
+              att_conv_name = 'classnet-conv_{}'.format(i)
+            else:
+              att_conv_name = '{}-conv_{}'.format(att_name, i)
             if 'kernel_initializer' in conv_kwargs:
               conv_kwargs['kernel_initializer'] = tf_utils.clone_initializer(
                   conv_kwargs['kernel_initializer'])
```
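The sharing mechanism is name-based: when the flag is on, a classification-type attribute head builds its convs under the main classifier's `classnet-conv_{i}` names instead of its own `{att_name}-conv_{i}` names, which is what lets the head reuse those layers' weights. A standalone sketch of just the naming rule (illustrative helper, not library code):

```python
# Hypothetical helper isolating the naming rule from the hunk above.
def att_conv_name(att_name, att_type, i, share_classification_heads):
  if share_classification_heads and att_type == 'classification':
    return 'classnet-conv_{}'.format(i)  # reuse main classifier's layer name
  return '{}-conv_{}'.format(att_name, i)  # per-attribute layer name

print(att_conv_name('depth', 'regression', 0, True))      # depth-conv_0
print(att_conv_name('label', 'classification', 0, True))  # classnet-conv_0
```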
official/vision/modeling/heads/dense_prediction_heads_test.py (+8 −6)
```diff
@@ -25,14 +25,15 @@ from official.vision.modeling.heads import dense_prediction_heads
 class RetinaNetHeadTest(parameterized.TestCase, tf.test.TestCase):

   @parameterized.parameters(
-      (False, False, False),
-      (False, True, False),
-      (True, False, True),
-      (True, True, True),
+      (False, False, False, None, False),
+      (False, True, False, None, False),
+      (True, False, True, 'regression', False),
+      (True, True, True, 'classification', True),
   )
-  def test_forward(self, use_separable_conv, use_sync_bn, has_att_heads):
+  def test_forward(self, use_separable_conv, use_sync_bn, has_att_heads,
+                   att_type, share_classification_heads):
     if has_att_heads:
-      attribute_heads = [dict(name='depth', type='regression', size=1)]
+      attribute_heads = [dict(name='depth', type=att_type, size=1)]
     else:
       attribute_heads = None
@@ -44,6 +45,7 @@ class RetinaNetHeadTest(parameterized.TestCase, tf.test.TestCase):
         num_convs=2,
         num_filters=256,
         attribute_heads=attribute_heads,
+        share_classification_heads=share_classification_heads,
         use_separable_conv=use_separable_conv,
         activation='relu',
         use_sync_bn=use_sync_bn,
```
official/vision/modeling/maskrcnn_model.py (+1 −1)
```diff
@@ -158,7 +158,7 @@ class MaskRCNNModel(tf.keras.Model):
           matched_gt_classes=intermediate_outputs['matched_gt_classes'],
           gt_masks=gt_masks,
           training=training)
-      model_outputs.update(model_mask_outputs)
+      model_outputs.update(model_mask_outputs)  # pytype: disable=attribute-error  # dynamic-method-lookup
     return model_outputs

   def _get_backbone_and_decoder_features(self, images):
```
official/vision/ops/preprocess_ops.py (+47 −0)
```diff
@@ -638,6 +638,53 @@ def random_horizontal_flip(image, normalized_boxes=None, masks=None, seed=1):
   return image, normalized_boxes, masks


+def random_horizontal_flip_with_roi(
+    image: tf.Tensor,
+    boxes: Optional[tf.Tensor] = None,
+    masks: Optional[tf.Tensor] = None,
+    roi_boxes: Optional[tf.Tensor] = None,
+    seed: int = 1
+) -> Tuple[tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor],
+           Optional[tf.Tensor]]:
+  """Randomly flips input image and bounding boxes.
+
+  Extends preprocess_ops.random_horizontal_flip to also flip roi_boxes used
+  by ViLD.
+
+  Args:
+    image: `tf.Tensor`, the image to apply the random flip.
+    boxes: `tf.Tensor` or `None`, boxes corresponding to the image.
+    masks: `tf.Tensor` or `None`, masks corresponding to the image.
+    roi_boxes: `tf.Tensor` or `None`, RoIs corresponding to the image.
+    seed: Seed for Tensorflow's random number generator.
+
+  Returns:
+    image: `tf.Tensor`, flipped image.
+    boxes: `tf.Tensor` or `None`, flipped boxes corresponding to the image.
+    masks: `tf.Tensor` or `None`, flipped masks corresponding to the image.
+    roi_boxes: `tf.Tensor` or `None`, flipped RoIs corresponding to the image.
+  """
+  with tf.name_scope('random_horizontal_flip'):
+    do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5)
+
+    image = tf.cond(do_flip, lambda: horizontal_flip_image(image),
+                    lambda: image)
+
+    if boxes is not None:
+      boxes = tf.cond(do_flip, lambda: horizontal_flip_boxes(boxes),
+                      lambda: boxes)
+
+    if masks is not None:
+      masks = tf.cond(do_flip, lambda: horizontal_flip_masks(masks),
+                      lambda: masks)
+
+    if roi_boxes is not None:
+      roi_boxes = tf.cond(do_flip, lambda: horizontal_flip_boxes(roi_boxes),
+                          lambda: roi_boxes)
+
+    return image, boxes, masks, roi_boxes
+
+
 def color_jitter(image: tf.Tensor,
                  brightness: Optional[float] = 0.,
                  contrast: Optional[float] = 0.,
```
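A hedged usage sketch of the new op; the box layout ([ymin, xmin, ymax, xmax], normalized) follows the conventions used elsewhere in `preprocess_ops`, and the concrete tensors are illustrative:

```python
import tensorflow as tf
from official.vision.ops import preprocess_ops

image = tf.zeros([256, 256, 3])
boxes = tf.constant([[0.1, 0.2, 0.5, 0.6]])      # [N, 4], normalized (assumed)
roi_boxes = tf.constant([[0.0, 0.0, 1.0, 0.4]])  # ViLD RoIs, same layout

image, boxes, _, roi_boxes = preprocess_ops.random_horizontal_flip_with_roi(
    image, boxes=boxes, roi_boxes=roi_boxes, seed=1)
# When the flip fires, x-coordinates mirror: xmin' = 1 - xmax, xmax' = 1 - xmin,
# applied identically to boxes and roi_boxes.
```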
official/vision/ops/preprocess_ops_3d.py (+95 −48)
```diff
@@ -18,8 +18,7 @@ from typing import Optional, Tuple
 import tensorflow as tf


-def _sample_or_pad_sequence_indices(sequence: tf.Tensor,
-                                    num_steps: int,
+def _sample_or_pad_sequence_indices(sequence: tf.Tensor, num_steps: int,
                                     stride: int,
                                     offset: tf.Tensor) -> tf.Tensor:
   """Returns indices to take for sampling or padding sequences to fixed size."""
@@ -28,18 +27,16 @@ def _sample_or_pad_sequence_indices(sequence: tf.Tensor,
   # Repeats sequence until num_steps are available in total.
   max_length = num_steps * stride + offset
-  num_repeats = tf.math.floordiv(
-      max_length + sequence_length - 1,
-      sequence_length)
+  num_repeats = tf.math.floordiv(max_length + sequence_length - 1,
+                                 sequence_length)
   sel_idx = tf.tile(sel_idx, [num_repeats])
   steps = tf.range(offset, offset + num_steps * stride, stride)
   return tf.gather(sel_idx, steps)


-def sample_linspace_sequence(sequence: tf.Tensor,
-                             num_windows: int,
-                             num_steps: int,
-                             stride: int) -> tf.Tensor:
+def sample_linspace_sequence(sequence: tf.Tensor, num_windows: int,
+                             num_steps: int, stride: int) -> tf.Tensor:
   """Samples `num_windows` segments from sequence with linearly spaced offsets.

   The samples are concatenated in a single `tf.Tensor` in order to have the same
@@ -66,11 +63,12 @@ def sample_linspace_sequence(sequence: tf.Tensor,
   all_indices = []
   for i in range(num_windows):
-    all_indices.append(_sample_or_pad_sequence_indices(
-        sequence=sequence,
-        num_steps=num_steps,
-        stride=stride,
-        offset=offsets[i]))
+    all_indices.append(
+        _sample_or_pad_sequence_indices(
+            sequence=sequence,
+            num_steps=num_steps,
+            stride=stride,
+            offset=offsets[i]))
   indices = tf.concat(all_indices, axis=0)
   indices.set_shape((num_windows * num_steps,))
@@ -110,25 +108,76 @@ def sample_sequence(sequence: tf.Tensor,
         sequence_length > (num_steps - 1) * frame_stride,
         lambda: sequence_length - (num_steps - 1) * frame_stride,
         lambda: sequence_length)
-    offset = tf.random.uniform(
-        (),
-        maxval=tf.cast(max_offset, dtype=tf.int32),
-        dtype=tf.int32,
-        seed=seed)
+    offset = tf.random.uniform((),
+                               maxval=tf.cast(max_offset, dtype=tf.int32),
+                               dtype=tf.int32,
+                               seed=seed)
   else:
     offset = (sequence_length - num_steps * stride) // 2
     offset = tf.maximum(0, offset)

   indices = _sample_or_pad_sequence_indices(
-      sequence=sequence,
-      num_steps=num_steps,
-      stride=stride,
-      offset=offset)
+      sequence=sequence, num_steps=num_steps, stride=stride, offset=offset)
   indices.set_shape((num_steps,))

   return tf.gather(sequence, indices)


+def sample_segment_sequence(sequence: tf.Tensor,
+                            num_frames: int,
+                            is_training: bool,
+                            seed: Optional[int] = None) -> tf.Tensor:
+  """Samples a single segment of size `num_frames` from a given sequence.
+
+  This function follows the temporal segment network sampling style
+  (https://arxiv.org/abs/1608.00859). The video sequence would be divided into
+  `num_frames` non-overlapping segments with same length. If `is_training` is
+  `True`, we would randomly sampling one frame for each segment, and when
+  `is_training` is `False`, only the center frame of each segment is sampled.
+
+  Args:
+    sequence: Any tensor where the first dimension is timesteps.
+    num_frames: Number of frames to take.
+    is_training: A boolean indicating sampling in training or evaluation mode.
+    seed: A deterministic seed to use when sampling.
+
+  Returns:
+    A single `tf.Tensor` with first dimension `num_steps` with the sampled
+    segment.
+  """
+  sequence_length = tf.shape(sequence)[0]
+  sequence_length = tf.cast(sequence_length, tf.float32)
+  segment_length = tf.cast(sequence_length // num_frames, tf.float32)
+  segment_indices = tf.linspace(0.0, sequence_length, num_frames + 1)
+  segment_indices = tf.cast(segment_indices, tf.int32)
+
+  if is_training:
+    segment_length = tf.cast(segment_length, tf.int32)
+    # pylint:disable=g-long-lambda
+    segment_offsets = tf.cond(
+        segment_length == 0,
+        lambda: tf.zeros(shape=(num_frames,), dtype=tf.int32),
+        lambda: tf.random.uniform(
+            shape=(num_frames,),
+            minval=0,
+            maxval=segment_length,
+            dtype=tf.int32,
+            seed=seed))
+    # pylint:enable=g-long-lambda
+  else:
+    # Only sampling central frame during inference for being deterministic.
+    segment_offsets = tf.ones(
+        shape=(num_frames,), dtype=tf.int32) * tf.cast(
+            segment_length // 2, dtype=tf.int32)
+
+  indices = segment_indices[:-1] + segment_offsets
+  indices.set_shape((num_frames,))
+
+  return tf.gather(sequence, indices)
+
+
 def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
   """Decodes JPEG raw bytes string into a RGB uint8 Tensor.
@@ -144,7 +193,9 @@ def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
   """
   return tf.map_fn(
       lambda x: tf.image.decode_jpeg(x, channels=channels),
-      image_string, back_prop=False, dtype=tf.uint8)
+      image_string,
+      back_prop=False,
+      dtype=tf.uint8)


 def crop_image(frames: tf.Tensor,
@@ -229,8 +280,7 @@ def crop_image(frames: tf.Tensor,
   return frames


-def resize_smallest(frames: tf.Tensor,
-                    min_resize: int) -> tf.Tensor:
+def resize_smallest(frames: tf.Tensor, min_resize: int) -> tf.Tensor:
   """Resizes frames so that min(`height`, `width`) is equal to `min_resize`.

   This function will not do anything if the min(`height`, `width`) is already
@@ -255,18 +305,15 @@ def resize_smallest(frames: tf.Tensor,
     frames_resized = tf.image.resize(frames, (output_h, output_w))
     return tf.cast(frames_resized, frames.dtype)

-  should_resize = tf.math.logical_or(tf.not_equal(input_w, output_w),
-                                     tf.not_equal(input_h, output_h))
+  should_resize = tf.math.logical_or(
+      tf.not_equal(input_w, output_w), tf.not_equal(input_h, output_h))
   frames = tf.cond(should_resize, resize_fn, lambda: frames)

   return frames


-def random_crop_resize(frames: tf.Tensor,
-                       output_h: int,
-                       output_w: int,
-                       num_frames: int,
-                       num_channels: int,
+def random_crop_resize(frames: tf.Tensor, output_h: int, output_w: int,
+                       num_frames: int, num_channels: int,
                        aspect_ratio: Tuple[float, float],
                        area_range: Tuple[float, float]) -> tf.Tensor:
   """First crops clip with jittering and then resizes to (output_h, output_w).
@@ -279,6 +326,7 @@ def random_crop_resize(frames: tf.Tensor,
     num_channels: Number of channels of the clip.
     aspect_ratio: Float tuple with the aspect range for cropping.
     area_range: Float tuple with the area range for cropping.
+
   Returns:
     A Tensor of shape [timesteps, output_h, output_w, channels] of type
       frames.dtype.
@@ -299,21 +347,16 @@ def random_crop_resize(frames: tf.Tensor,
   bbox_begin, bbox_size, _ = sample_distorted_bbox
   offset_y, offset_x, _ = tf.unstack(bbox_begin)
   target_height, target_width, _ = tf.unstack(bbox_size)
-  size = tf.convert_to_tensor((
-      seq_len, target_height, target_width, channels))
-  offset = tf.convert_to_tensor((
-      0, offset_y, offset_x, 0))
+  size = tf.convert_to_tensor((seq_len, target_height, target_width, channels))
+  offset = tf.convert_to_tensor((0, offset_y, offset_x, 0))
   frames = tf.slice(frames, offset, size)
-  frames = tf.cast(
-      tf.image.resize(frames, (output_h, output_w)),
-      frames.dtype)
+  frames = tf.cast(
+      tf.image.resize(frames, (output_h, output_w)), frames.dtype)
   frames.set_shape((num_frames, output_h, output_w, num_channels))
   return frames


 def random_flip_left_right(
     frames: tf.Tensor,
     seed: Optional[int] = None) -> tf.Tensor:
@@ -324,12 +367,16 @@ def random_flip_left_right(
     A Tensor of shape [timesteps, output_h, output_w, channels] eventually
     flipped left right.
   """
-  is_flipped = tf.random.uniform(
-      (), minval=0, maxval=2, dtype=tf.int32, seed=seed)
-  frames = tf.cond(tf.equal(is_flipped, 1),
-                   true_fn=lambda: tf.image.flip_left_right(frames),
-                   false_fn=lambda: frames)
+  is_flipped = tf.random.uniform((),
+                                 minval=0,
+                                 maxval=2,
+                                 dtype=tf.int32,
+                                 seed=seed)
+  frames = tf.cond(
+      tf.equal(is_flipped, 1),
+      true_fn=lambda: tf.image.flip_left_right(frames),
+      false_fn=lambda: frames)
   return frames
```
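A worked example of the new TSN-style sampler: a 100-frame clip divided into 10 segments of length 10. In evaluation mode each segment contributes its center frame, so the result is deterministically [5, 15, ..., 95], the same values the new unit test below asserts:

```python
import tensorflow as tf
from official.vision.ops import preprocess_ops_3d

clip = tf.range(100)  # stand-in for a [timesteps, ...] video tensor
eval_frames = preprocess_ops_3d.sample_segment_sequence(
    clip, num_frames=10, is_training=False)
print(eval_frames.numpy())  # [ 5 15 25 35 45 55 65 75 85 95]

train_frames = preprocess_ops_3d.sample_segment_sequence(
    clip, num_frames=10, is_training=True, seed=1)
# In training mode, each sampled frame falls somewhere inside its own segment.
```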
official/vision/ops/preprocess_ops_3d_test.py (+10 −0)
```diff
@@ -72,6 +72,16 @@ class ParserUtilsTest(tf.test.TestCase):
     self.assertBetween(offset_3, 0, 99)
     self.assertAllEqual(sampled_seq_3, range(offset_3, offset_3 + 10))

+  def test_sample_segment_sequence(self):
+    sequence = tf.range(100)
+    sampled_seq_1 = preprocess_ops_3d.sample_segment_sequence(
+        sequence, 10, False)
+    sampled_seq_2 = preprocess_ops_3d.sample_segment_sequence(
+        sequence, 10, True)
+
+    self.assertAllEqual(sampled_seq_1, [5 + i * 10 for i in range(10)])
+    for idx, v in enumerate(sampled_seq_2):
+      self.assertBetween(v - idx * 10, 0, 10)
+
   def test_decode_jpeg(self):
     # Create a random RGB JPEG image.
     random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
```
research/object_detection/meta_architectures/center_net_meta_arch.py (+9 −0)
```diff
@@ -4235,6 +4235,15 @@ class CenterNetMetaArch(model.DetectionModel):
           axis=-2)
       multiclass_scores = postprocess_dict[
           fields.DetectionResultFields.detection_multiclass_scores]
+      num_classes = tf.shape(multiclass_scores)[2]
+      class_mask = tf.cast(
+          tf.one_hot(
+              postprocess_dict[fields.DetectionResultFields.detection_classes],
+              depth=num_classes), tf.bool)
+      # Suppress the scores of unselected classes to zero. Otherwise, the
+      # downstream NMS ops might be confused and introduce issues.
+      multiclass_scores = tf.where(
+          class_mask, multiclass_scores, tf.zeros_like(multiclass_scores))
       num_valid_boxes = postprocess_dict.pop(
           fields.DetectionResultFields.num_detections)
       # Remove scores and classes as NMS will compute these from multiclass
```
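The masking trick in isolation: one-hot the predicted class per box, keep that class's score, and zero out the rest, so downstream NMS sees an unambiguous score tensor. A standalone sketch with assumed shapes (scores [batch, num_boxes, num_classes], classes [batch, num_boxes]):

```python
import tensorflow as tf

scores = tf.constant([[[0.9, 0.4, 0.3]]])  # multiclass scores for one box
classes = tf.constant([[0]])               # predicted class index per box
mask = tf.cast(tf.one_hot(classes, depth=3), tf.bool)
masked = tf.where(mask, scores, tf.zeros_like(scores))
print(masked.numpy())  # [[[0.9 0.  0. ]]]
```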