Merge branch 'tensorflow:master' into panoptic-deeplab-modeling

7479dbb8 · Srihari Humbarwadi · GitHub · 8b60a5a8 · 9c8cbd0c · 7479dbb8
Unverified Commit 7479dbb8 authored Feb 15, 2022 by Srihari Humbarwadi Committed by GitHub Feb 15, 2022
7 changed files
--- a/research/object_detection/configs/tf2/center_net_deepmac_512x512_voc_only_tpu-32.config
+++ b/research/object_detection/configs/tf2/center_net_deepmac_512x512_voc_only_tpu-32.config
@@ -3,7 +3,6 @@
 # mask head. This config is only trained on masks from the VOC classes in COCO
 # and achieves a mask mAP of 32.5% on non-VOC classes.
 # [1]: https://arxiv.org/abs/2104.00613
-# [2]: https://arxiv.org/abs/1904.07850
 # Train on TPU-32
@@ -55,6 +54,7 @@ model {
      classification_loss {
        weighted_sigmoid {}
      }
+      use_only_last_stage: true
      allowed_masked_classes_ids: [
        1,   # person

--- a/research/object_detection/dataset_tools/seq_example_util.py
+++ b/research/object_detection/dataset_tools/seq_example_util.py
@@ -32,6 +32,8 @@ def context_float_feature(ndarray):
  """
  feature = tf.train.Feature()
  for val in ndarray:
+    if isinstance(val, np.ndarray):
+      val = val.item()
    feature.float_list.value.append(val)
  return feature
@@ -47,6 +49,8 @@ def context_int64_feature(ndarray):
  """
  feature = tf.train.Feature()
  for val in ndarray:
+    if isinstance(val, np.ndarray):
+      val = val.item()
    feature.int64_list.value.append(val)
  return feature
@@ -81,7 +85,7 @@ def sequence_float_feature(ndarray):
  for row in ndarray:
    feature = feature_list.feature.add()
    if row.size:
-      feature.float_list.value[:] = row
+      feature.float_list.value[:] = np.ravel(row)
  return feature_list
@@ -98,7 +102,7 @@ def sequence_int64_feature(ndarray):
  for row in ndarray:
    feature = feature_list.feature.add()
    if row.size:
-      feature.int64_list.value[:] = row
+      feature.int64_list.value[:] = np.ravel(row)
  return feature_list
@@ -118,7 +122,7 @@ def sequence_bytes_feature(ndarray):
    feature = feature_list.feature.add()
    if row:
      row = [tf.compat.as_bytes(val) for val in row]
-      feature.bytes_list.value[:] = row
+      feature.bytes_list.value[:] = np.ravel(row)
  return feature_list

--- a/research/object_detection/model_lib_v2.py
+++ b/research/object_detection/model_lib_v2.py
@@ -1164,3 +1164,7 @@ def eval_continuously(
          postprocess_on_cpu=postprocess_on_cpu,
          global_step=global_step,
          )
+    if global_step.numpy() == configs['train_config'].num_steps:
+      tf.logging.info('Exiting evaluation at step %d', global_step.numpy())
+      return
--- a/research/object_detection/models/keras_models/resnet_v1.py
+++ b/research/object_detection/models/keras_models/resnet_v1.py
@@ -19,13 +19,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from tensorflow.python.keras.applications import resnet
 import tensorflow.compat.v1 as tf
 from object_detection.core import freezable_batch_norm
 from object_detection.models.keras_models import model_utils
+try:
+  from keras.applications import resnet  # pylint:disable=g-import-not-at-top
+except ImportError:
+  from tensorflow.python.keras.applications import resnet  # pylint:disable=g-import-not-at-top
 def _fixed_padding(inputs, kernel_size, rate=1):  # pylint: disable=invalid-name
  """Pads the input along the spatial dimensions independently of input size.

--- a/research/object_detection/protos/center_net.proto
+++ b/research/object_detection/protos/center_net.proto
@@ -220,8 +220,8 @@ message CenterNet {
    //   scores * exp((-distances^2) / (2 * sigma^2))
    // where 'distances' is the distance between the heatmap peak location and
    // the regressed joint location and 'sigma' is the Gaussian standard
-    // deviation used in generating the Gaussian heatmap target multiplied by the
+    // deviation used in generating the Gaussian heatmap target multiplied by
-    // 'std_dev_multiplier'.
+    // the 'std_dev_multiplier'.
    optional float std_dev_multiplier = 29 [default = 1.0];
    // The radius (in the unit of output pixel) around heatmap peak to assign
@@ -415,7 +415,7 @@ message CenterNet {
    optional int32 dim = 3 [default = 256];
    // The dimension of the per-pixel embedding
-    optional int32 pixel_embedding_dim = 4 [default=16];
+    optional int32 pixel_embedding_dim = 4 [default = 16];
    // If set, masks are only kept for classes listed here. Masks are deleted
    // for all other classes. Note that this is only done at training time, eval
@@ -424,75 +424,86 @@ message CenterNet {
    // The size of cropped pixel embedding that goes into the 2D mask prediction
    // network (RoI align).
-    optional int32 mask_size = 6 [default=32];
+    optional int32 mask_size = 6 [default = 32];
    // If set to a positive value, we subsample instances by this amount to
    // save memory during training.
-    optional int32 mask_num_subsamples = 67[default=-1];
+    optional int32 mask_num_subsamples = 67 [default = -1];
    // Whether or not to use (x, y) coordinates as input to mask net.
-    optional bool use_xy = 8 [default=true];
+    optional bool use_xy = 8 [default = true];
    // Defines the kind of architecture we want to use for mask network.
-    optional string network_type = 9 [default="hourglass52"];
+    optional string network_type = 9 [default = "hourglass52"];
    // Whether or not we want to use instance embedding in mask network.
-    optional bool use_instance_embedding = 10 [default=true];
+    optional bool use_instance_embedding = 10 [default = true];
    // Number of channels in the initial block of the mask prediction network.
-    optional int32 num_init_channels = 11 [default=64];
+    optional int32 num_init_channels = 11 [default = 64];
    // Whether or not to predict masks at full resolution. If true, we predict
    // masks at the resolution of the output stride. Otherwise, masks are
    // predicted at resolution defined by mask_size
-    optional bool predict_full_resolution_masks = 12 [default=false];
+    optional bool predict_full_resolution_masks = 12 [default = false];
    // If predict_full_resolution_masks is set, this parameter controls the size
    // of cropped masks returned by post-process. To be compatible with the rest
    // of the API, masks are always cropped and resized according to detected
    // boxes in postprocess.
-    optional int32 postprocess_crop_size = 13 [default=256];
+    optional int32 postprocess_crop_size = 13 [default = 256];
    // The maximum relative amount by which boxes will be jittered before
    // RoI crop happens. The x and y coordinates of the box are jittered
    // relative to width and height respectively.
-    optional float max_roi_jitter_ratio = 14 [default=0.0];
+    optional float max_roi_jitter_ratio = 14 [default = 0.0];
    // The mode for jittering box ROIs. See RandomJitterBoxes in
    // preprocessor.proto for more details
-    optional RandomJitterBoxes.JitterMode jitter_mode = 15 [default=DEFAULT];
+    optional RandomJitterBoxes.JitterMode jitter_mode = 15 [default = DEFAULT];
    // Weight for the box consistency loss as described in the BoxInst paper
    // https://arxiv.org/abs/2012.02310
-    optional float box_consistency_loss_weight = 16 [default=0.0];
+    optional float box_consistency_loss_weight = 16 [default = 0.0];
-    optional float color_consistency_threshold = 17 [default=0.4];
+    optional float color_consistency_threshold = 17 [default = 0.4];
-    optional int32 color_consistency_dilation = 18 [default=2];
+    optional int32 color_consistency_dilation = 18 [default = 2];
-    optional float color_consistency_loss_weight = 19 [default=0.0];
+    optional float color_consistency_loss_weight = 19 [default = 0.0];
-    optional LossNormalize box_consistency_loss_normalize = 20 [
+    optional LossNormalize box_consistency_loss_normalize = 20
-      default=NORMALIZE_AUTO];
+        [default = NORMALIZE_AUTO];
    // If set, will use the bounding box tightness prior approach. This means
    // that the max will be restricted to only be inside the box for both
    // dimensions. See details here:
    // https://papers.nips.cc/paper/2019/hash/e6e713296627dff6475085cc6a224464-Abstract.html
-    optional bool box_consistency_tightness = 21 [default=false];
+    optional bool box_consistency_tightness = 21 [default = false];
-    optional int32 color_consistency_warmup_steps = 22 [default=0];
+    optional int32 color_consistency_warmup_steps = 22 [default = 0];
-    optional int32 color_consistency_warmup_start = 23 [default=0];
+    optional int32 color_consistency_warmup_start = 23 [default = 0];
+    // This flag controls whether or not we use the outputs from only the
+    // last stage of the hourglass for training the mask-heads.
    // DeepMAC has been refactored to process the entire batch at once,
    // instead of the previous (simple) approach of processing one sample at
-    // a time. Because of this, the memory consumption has increased and
+    // a time. Because of this, we need to set this flag to continue using
-    // it's crucial to only feed the mask head the last stage outputs
+    // the old models with the same training hardware.
-    // from the hourglass. Doing so halves the memory requirement of the
-    // mask head and does not cause a drop in evaluation metrics.
+    // This flag is not needed for 1024x1024 models. The performance and
-    optional bool use_only_last_stage = 24 [default=false];
+    // memory usage are the same as before.
+    // For 512x512 models
+    // - Setting this flag to true will let the model train on TPU-v3 32
+    //   chips. We observed a small (0.26 mAP) performance drop when doing so.
+    // - Setting this flag to false (default) increases the TPU requirement
+    //   to TPU-v3 128 and reproduces previously demonstrated performance
+    //   within error bars.
+    optional bool use_only_last_stage = 24 [default = false];
  }
  optional DeepMACMaskEstimation deepmac_mask_estimation = 14;
@@ -506,7 +517,7 @@ message CenterNet {
 }
 enum LossNormalize {
-  NORMALIZE_AUTO = 0; // SUM for 2D inputs (dice loss) and MEAN for others.
+  NORMALIZE_AUTO = 0;  // SUM for 2D inputs (dice loss) and MEAN for others.
  NORMALIZE_GROUNDTRUTH_COUNT = 1;
  NORMALIZE_BALANCED = 3;
 }
@@ -547,4 +558,3 @@ message CenterNetFeatureExtractor {
  optional string upsampling_interpolation = 11 [default = 'nearest'];
 }
--- a/tensorflow_models/__init__.py
+++ b/tensorflow_models/__init__.py
@@ -20,4 +20,4 @@ from tensorflow_models import vision
 from official import core
 from official.modeling import hyperparams
 from official.modeling import optimization
-from official.modeling import tf_utils
+from official.modeling import tf_utils as utils
--- a/tensorflow_models/vision/__init__.py
+++ b/tensorflow_models/vision/__init__.py
@@ -13,5 +13,5 @@
 # limitations under the License.
 """TensorFlow Models Vision Libraries."""
-from official.vision.beta import configs
+from official.vision import configs
-from official.vision.beta.modeling import *
+from official.vision.modeling import *