Commit 31ca3b97 authored by Kaushik Shivakumar

resolve merge conflicts

parents 3e9d886d 7fcd7cba
@@ -314,7 +314,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
       self, inserted_layer_counter, target_channel):
     projection_layers = []
     if inserted_layer_counter >= 0:
-      use_bias = False if self._apply_batch_norm else True
+      use_bias = False if (self._apply_batch_norm and not
+                           self._conv_hyperparams.force_use_bias()) else True
       projection_layers.append(keras.Conv2D(
           target_channel, [1, 1], strides=1, padding='SAME',
           name='ProjectionLayer/conv2d_{}'.format(inserted_layer_counter),
@@ -331,7 +332,8 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
     conv_layers = []
     batch_norm_layers = []
     activation_layers = []
-    use_bias = False if self._apply_batch_norm else True
+    use_bias = False if (self._apply_batch_norm and not
+                         self._conv_hyperparams.force_use_bias()) else True
     for additional_conv_layer_idx in range(self._num_layers_before_predictor):
       layer_name = '{}/conv2d_{}'.format(
           tower_name_scope, additional_conv_layer_idx)
@@ -363,7 +365,9 @@ class WeightSharedConvolutionalBoxPredictor(box_predictor.KerasBoxPredictor):
             training=(self._is_training and not self._freeze_batchnorm),
             name='{}/conv2d_{}/BatchNorm/feature_{}'.format(
                 tower_name_scope, additional_conv_layer_idx, feature_index)))
-      activation_layers.append(tf.keras.layers.Lambda(tf.nn.relu6))
+      activation_layers.append(self._conv_hyperparams.build_activation_layer(
+          name='{}/conv2d_{}/activation_{}'.format(
+              tower_name_scope, additional_conv_layer_idx, feature_index)))
     # Set conv layers as the shared conv layers for different feature maps with
     # the same tower_name_scope.
......
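The force_use_bias() accessor introduced above is read from the predictor's conv_hyperparams. A minimal pipeline-config sketch of how that override might be expressed, assuming hyperparams.proto exposes a matching force_use_bias field (that field is not shown in this diff); the regularizer and initializer values are illustrative:

conv_hyperparams {
  regularizer {
    l2_regularizer {
      weight: 0.00004
    }
  }
  initializer {
    truncated_normal_initializer {
      stddev: 0.03
    }
  }
  batch_norm {
    scale: true
  }
  # Assumed field backing force_use_bias(): keep conv biases even when
  # batch norm is applied.
  force_use_bias: true
}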
@@ -61,7 +61,7 @@ class Head(object):
     pass


-class KerasHead(tf.keras.Model):
+class KerasHead(tf.keras.layers.Layer):
   """Keras head base class."""

   def call(self, features):
......
@@ -183,6 +183,41 @@ message CenterNet {
     optional float heatmap_bias_init = 3 [default = -2.19];
   }
   optional MaskEstimation mask_estimation_task = 8;
+
+  // Parameters related to the DensePose estimation task.
+  // http://densepose.org/
+  message DensePoseEstimation {
+    // Weight of the task loss. The total loss of the model is the summation
+    // of the task losses weighted by these weights.
+    optional float task_loss_weight = 1 [default = 1.0];
+
+    // Class ID (0-indexed) that corresponds to the object in the label map
+    // that contains DensePose data.
+    optional int32 class_id = 2;
+
+    // Loss configuration for DensePose heatmap and regression losses. Note
+    // that the localization loss is used for surface coordinate losses and
+    // the classification loss is used for part classification losses.
+    optional Loss loss = 3;
+
+    // The number of body parts.
+    optional int32 num_parts = 4 [default = 24];
+
+    // Loss weights for the two DensePose heads.
+    optional float part_loss_weight = 5 [default = 1.0];
+    optional float coordinate_loss_weight = 6 [default = 1.0];
+
+    // Whether to upsample the prediction feature maps back to the original
+    // input dimension prior to applying the loss. This has the benefit of
+    // maintaining finer groundtruth location information.
+    optional bool upsample_to_input_res = 7 [default = true];
+
+    // The initial bias value of the convolution kernel of the class heatmap
+    // prediction head. -2.19 corresponds to predicting foreground with
+    // a probability of 0.1.
+    optional float heatmap_bias_init = 8 [default = -2.19];
+  }
+  optional DensePoseEstimation densepose_estimation_task = 9;
 }

 message CenterNetFeatureExtractor {
......
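For reference, a hedged sketch of how the new densepose_estimation_task block might be enabled in a CenterNet pipeline config; the enclosing center_net block and the specific loss choices are illustrative and not part of this diff:

center_net {
  # ... feature extractor, object detection task, etc. ...
  densepose_estimation_task {
    task_loss_weight: 1.0
    class_id: 0  # label map class that carries DensePose annotations
    num_parts: 24
    part_loss_weight: 1.0
    coordinate_loss_weight: 1.0
    upsample_to_input_res: true
    heatmap_bias_init: -2.19
    loss {
      localization_loss {
        l1_localization_loss {
        }
      }
      classification_loss {
        weighted_sigmoid {
        }
      }
    }
  }
}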
@@ -31,7 +31,7 @@ enum InputType {
   TF_SEQUENCE_EXAMPLE = 2; // TfSequenceExample Input
 }

-// Next id: 31
+// Next id: 33
 message InputReader {
   // Name of input reader. Typically used to describe the dataset that is read
   // by this input reader.
@@ -119,6 +119,10 @@ message InputReader {
   // Type of instance mask.
   optional InstanceMaskType mask_type = 10 [default = NUMERICAL_MASKS];

+  // Whether to load DensePose data. If set, must also set load_instance_masks
+  // to true.
+  optional bool load_dense_pose = 31 [default = false];
+
   // Whether to use the display name when decoding examples. This is only used
   // when mapping class text strings to integers.
   optional bool use_display_name = 17 [default = false];
@@ -129,6 +133,10 @@ message InputReader {
   // Whether input data type is tf.Examples or tf.SequenceExamples
   optional InputType input_type = 30 [default = TF_EXAMPLE];

+  // Which frame to choose from the input if Sequence Example. -1 indicates
+  // random choice.
+  optional int32 frame_index = 32 [default = -1];
+
   oneof input_reader {
     TFRecordInputReader tf_record_input_reader = 8;
     ExternalInputReader external_input_reader = 9;
......
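Two small input-reader sketches using the new fields; the paths are placeholders, and everything except load_dense_pose and frame_index already existed before this change:

# Loading DensePose data (instance masks must be loaded as well).
train_input_reader {
  label_map_path: "PATH_TO_BE_CONFIGURED/label_map.pbtxt"
  tf_record_input_reader {
    input_path: "PATH_TO_BE_CONFIGURED/train-?????-of-?????"
  }
  load_instance_masks: true
  load_dense_pose: true
}

# Selecting a frame from sequence-example input; -1 picks a random frame.
train_input_reader {
  label_map_path: "PATH_TO_BE_CONFIGURED/label_map.pbtxt"
  tf_record_input_reader {
    input_path: "PATH_TO_BE_CONFIGURED/train_sequences-?????-of-?????"
  }
  input_type: TF_SEQUENCE_EXAMPLE
  frame_index: -1
}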
@@ -4,7 +4,7 @@ package object_detection.protos;

 // Message for defining a preprocessing operation on input data.
 // See: //third_party/tensorflow_models/object_detection/core/preprocessor.py
-// Next ID: 38
+// Next ID: 39
 message PreprocessingStep {
   oneof preprocessing_step {
     NormalizeImage normalize_image = 1;
@@ -44,6 +44,7 @@ message PreprocessingStep {
     RandomDownscaleToTargetPixels random_downscale_to_target_pixels = 35;
     RandomPatchGaussian random_patch_gaussian = 36;
     RandomSquareCropByScale random_square_crop_by_scale = 37;
+    RandomScaleCropAndPadToSquare random_scale_crop_and_pad_to_square = 38;
   }
 }
@@ -572,3 +573,20 @@ message RandomSquareCropByScale {
   // [min_scale, max_scale]
   optional int32 num_scales = 4 [default=8];
 }
+
+// Randomly scale, crop, and then pad an image to the desired square output
+// dimensions. Specifically, this method first samples a random_scale factor
+// from a uniform distribution between scale_min and scale_max, and then
+// resizes the image such that its maximum dimension is (output_size *
+// random_scale). Secondly, a square output_size crop is extracted from the
+// resized image, and finally the cropped region is padded to the desired
+// square output_size. The augmentation is borrowed from [1].
+// [1]: https://arxiv.org/abs/1911.09070
+message RandomScaleCropAndPadToSquare {
+  // The (square) output image size.
+  optional int32 output_size = 1 [default = 512];
+
+  // The minimum and maximum values from which to sample the random scale.
+  optional float scale_min = 2 [default=0.1];
+  optional float scale_max = 3 [default=2.0];
+}
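As a usage sketch, the new step is selected through data_augmentation_options in the train config; the values below simply restate the defaults defined above:

train_config {
  # ... optimizer, batch size, etc. ...
  data_augmentation_options {
    random_scale_crop_and_pad_to_square {
      output_size: 512
      scale_min: 0.1
      scale_max: 2.0
    }
  }
}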
@@ -145,7 +145,7 @@ message Ssd {
   optional MaskHead mask_head_config = 25;
 }

-// Next id: 18.
+// Next id: 20.
 message SsdFeatureExtractor {
   reserved 6;
@@ -185,8 +185,13 @@ message SsdFeatureExtractor {
   // feature maps added by SSD.
   optional bool use_depthwise = 8 [default = false];

-  // Feature Pyramid Networks config.
-  optional FeaturePyramidNetworks fpn = 10;
+  oneof feature_pyramid_oneof {
+    // Feature Pyramid Networks config.
+    FeaturePyramidNetworks fpn = 10;
+
+    // Bidirectional Feature Pyramid Networks config.
+    BidirectionalFeaturePyramidNetworks bifpn = 19;
+  }

   // If true, replace preprocess function of feature extractor with a
   // placeholder. This should only be used if all the image preprocessing steps
@@ -225,3 +230,23 @@ message FeaturePyramidNetworks {
 }
+
+// Configuration for Bidirectional Feature Pyramid Networks.
+message BidirectionalFeaturePyramidNetworks {
+  // minimum level in the feature pyramid.
+  optional int32 min_level = 1 [default = 3];
+
+  // maximum level in the feature pyramid.
+  optional int32 max_level = 2 [default = 7];
+
+  // The number of repeated top-down bottom-up iterations for BiFPN-based
+  // feature extractors (bidirectional feature pyramid networks).
+  optional int32 num_iterations = 3;
+
+  // The number of filters (channels) to use in feature pyramid layers for
+  // BiFPN-based feature extractors (bidirectional feature pyramid networks).
+  optional int32 num_filters = 4;
+
+  // Method used to combine inputs to BiFPN nodes.
+  optional string combine_method = 5 [default = 'fast_attention'];
+}
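Because fpn and bifpn now sit in a oneof, a feature extractor selects exactly one of them. A hedged sketch of a BiFPN-based SSD feature extractor config; the type string and the filter/iteration counts are illustrative, not taken from this diff:

feature_extractor {
  type: "ssd_efficientnet-b0_bifpn_keras"  # hypothetical extractor name
  bifpn {
    min_level: 3
    max_level: 7
    num_iterations: 3
    num_filters: 64
    combine_method: 'fast_attention'
  }
}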
@@ -59,7 +59,8 @@ message TrainConfig {
   // Whether to load all checkpoint vars that match model variable names and
   // sizes. This option is only available if `from_detection_checkpoint` is
-  // True.
+  // True. This option is *not* supported for TF2 --- setting it to true
+  // will raise an error.
   optional bool load_all_detection_checkpoint_vars = 19 [default = false];

   // Number of steps to train the DetectionModel for. If 0, will train the model
......
# Context R-CNN configuration for Snapshot Serengeti Dataset, with sequence
# example input data with context_features.
# This model uses attention into contextual features within the Faster R-CNN
# object detection framework to improve object detection performance.
# See https://arxiv.org/abs/1912.03538 for more information.
# Search for "PATH_TO_BE_CONFIGURED" to find the fields that should be
# configured.
model {
faster_rcnn {
num_classes: 48
image_resizer {
fixed_shape_resizer {
height: 640
width: 640
}
}
feature_extractor {
type: "faster_rcnn_resnet101"
first_stage_features_stride: 16
batch_norm_trainable: true
}
first_stage_anchor_generator {
grid_anchor_generator {
height_stride: 16
width_stride: 16
scales: 0.25
scales: 0.5
scales: 1.0
scales: 2.0
aspect_ratios: 0.5
aspect_ratios: 1.0
aspect_ratios: 2.0
}
}
first_stage_box_predictor_conv_hyperparams {
op: CONV
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
truncated_normal_initializer {
stddev: 0.00999999977648
}
}
}
first_stage_nms_score_threshold: 0.0
first_stage_nms_iou_threshold: 0.699999988079
first_stage_max_proposals: 300
first_stage_localization_loss_weight: 2.0
first_stage_objectness_loss_weight: 1.0
initial_crop_size: 14
maxpool_kernel_size: 2
maxpool_stride: 2
second_stage_box_predictor {
mask_rcnn_box_predictor {
fc_hyperparams {
op: FC
regularizer {
l2_regularizer {
weight: 0.0
}
}
initializer {
variance_scaling_initializer {
factor: 1.0
uniform: true
mode: FAN_AVG
}
}
}
use_dropout: false
dropout_keep_probability: 1.0
share_box_across_classes: true
}
}
second_stage_post_processing {
batch_non_max_suppression {
score_threshold: 0.0
iou_threshold: 0.600000023842
max_detections_per_class: 100
max_total_detections: 300
}
score_converter: SOFTMAX
}
second_stage_localization_loss_weight: 2.0
second_stage_classification_loss_weight: 1.0
use_matmul_crop_and_resize: true
clip_anchors_to_image: true
use_matmul_gather_in_matcher: true
use_static_balanced_label_sampler: true
use_static_shapes: true
context_config {
max_num_context_features: 2000
context_feature_length: 2057
}
}
}
train_config {
batch_size: 8
data_augmentation_options {
random_horizontal_flip {
}
}
sync_replicas: true
optimizer {
momentum_optimizer {
learning_rate {
manual_step_learning_rate {
initial_learning_rate: 0.0
schedule {
step: 400000
learning_rate: 0.002
}
schedule {
step: 500000
learning_rate: 0.0002
}
schedule {
step: 600000
learning_rate: 0.00002
}
warmup: true
}
}
momentum_optimizer_value: 0.9
}
use_moving_average: false
}
gradient_clipping_by_norm: 10.0
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/faster_rcnn_resnet101_coco_2018_08_14/model.ckpt"
from_detection_checkpoint: true
num_steps: 5000000
replicas_to_aggregate: 8
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
use_bfloat16: true
}
train_input_reader {
label_map_path: "PATH_TO_BE_CONFIGURED/ss_label_map.pbtxt"
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/snapshot_serengeti_train-?????-of-?????"
}
load_context_features: true
input_type: TF_SEQUENCE_EXAMPLE
}
eval_config {
max_evals: 50
metrics_set: "coco_detection_metrics"
use_moving_averages: false
batch_size: 1
}
eval_input_reader {
label_map_path: "PATH_TO_BE_CONFIGURED/ss_label_map.pbtxt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/snapshot_serengeti_val-?????-of-?????"
}
load_context_features: true
input_type: TF_SEQUENCE_EXAMPLE
}