Commit 47bc1813 authored by syiming

Merge remote-tracking branch 'upstream/master' into add_multilevel_crop_and_resize

parents d8611151 b035a227
@@ -14,6 +14,7 @@
# ==============================================================================
"""Tests for object_detection.predictors.rfcn_box_predictor."""
import unittest
import numpy as np
import tensorflow.compat.v1 as tf
@@ -22,8 +23,10 @@ from object_detection.builders import hyperparams_builder
from object_detection.predictors import rfcn_keras_box_predictor as box_predictor
from object_detection.protos import hyperparams_pb2
from object_detection.utils import test_case
from object_detection.utils import tf_version
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class RfcnKerasBoxPredictorTest(test_case.TestCase):
def _build_conv_hyperparams(self):
@@ -42,8 +45,6 @@ class RfcnKerasBoxPredictorTest(test_case.TestCase):
return hyperparams_builder.KerasLayerHyperparams(conv_hyperparams)
def test_get_correct_box_encoding_and_class_prediction_shapes(self):
def graph_fn(image_features, proposal_boxes):
rfcn_box_predictor = box_predictor.RfcnKerasBoxPredictor(
is_training=False,
num_classes=2,
@@ -52,8 +53,9 @@ class RfcnKerasBoxPredictorTest(test_case.TestCase):
num_spatial_bins=[3, 3],
depth=4,
crop_size=[12, 12],
box_code_size=4
)
box_code_size=4)
def graph_fn(image_features, proposal_boxes):
box_predictions = rfcn_box_predictor(
[image_features],
proposal_boxes=proposal_boxes)
......
syntax = "proto2";
package object_detection.protos;
import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
// Configuration for the CenterNet meta architecture from the "Objects as
// Points" paper [1]. A minimal, illustrative config sketch follows the
// message definition below.
// [1]: https://arxiv.org/abs/1904.07850
message CenterNet {
// Number of classes to predict.
optional int32 num_classes = 1;
// Feature extractor config.
optional CenterNetFeatureExtractor feature_extractor = 2;
// Image resizer for preprocessing the input image.
optional ImageResizer image_resizer = 3;
// Parameters related to the object detection task.
message ObjectDetection {
// The original fields are moved to ObjectCenterParams or deleted.
reserved 2, 5, 6, 7;
// Weight of the task loss. The total loss of the model will be the
// summation of task losses weighted by the weights.
optional float task_loss_weight = 1 [default = 1.0];
// Weight for the offset localization loss.
optional float offset_loss_weight = 3 [default = 1.0];
// Weight for the height/width localization loss.
optional float scale_loss_weight = 4 [default = 0.1];
// Localization loss configuration for object scale and offset losses.
optional LocalizationLoss localization_loss = 8;
}
optional ObjectDetection object_detection_task = 4;
// Parameters related to object center prediction. This is required for both
// object detection and keypoint estimation tasks.
message ObjectCenterParams {
// Weight for the object center loss.
optional float object_center_loss_weight = 1 [default = 1.0];
// Classification loss configuration for object center loss.
optional ClassificationLoss classification_loss = 2;
// The initial bias value of the convolution kernel of the class heatmap
// prediction head. -2.19 corresponds to predicting foreground with
// a probability of 0.1; see "Focal Loss for Dense Object Detection"
// at https://arxiv.org/abs/1708.02002 and the short sketch after this
// message.
optional float heatmap_bias_init = 3 [default = -2.19];
// The minimum IOU overlap boxes need to have to not be penalized.
optional float min_box_overlap_iou = 4 [default = 0.7];
// Maximum number of boxes to predict.
optional int32 max_box_predictions = 5 [default = 100];
// If set, loss is only computed for the labeled classes.
optional bool use_labeled_classes = 6 [default = false];
}
optional ObjectCenterParams object_center_params = 5;
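For context on the -2.19 default above: with a sigmoid output, an initial bias of b = -log((1 - p) / p) makes the head start out predicting foreground with probability p, as in the focal loss paper. A minimal sketch of that arithmetic (plain Python, not code from this diff):

import math

p = 0.1  # desired initial foreground probability
bias = -math.log((1 - p) / p)
print(round(bias, 2))  # -2.2, matching the -2.19 default up to rounding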
// Path of the file that contains the label map along with the keypoint
// information, including the keypoint indices, corresponding labels, and the
// corresponding class. The file should be the same one used in the input
// pipeline. Note that this file is expected to contain a StringIntLabelMap
// proto in plain text format.
// It is required only if the keypoint estimation task is specified.
optional string keypoint_label_map_path = 6;
// Parameters related to the keypoint estimation task.
message KeypointEstimation {
// Name of the task, e.g. "human pose". Note that the task name should be
// unique to each keypoint task.
optional string task_name = 1;
// Weight of the task loss. The total loss of the model will be the
// summation of task losses weighted by the weights.
optional float task_loss_weight = 2 [default = 1.0];
// Loss configuration for the keypoint heatmap, offset, and regression
// losses. Note that the localization loss is used for the offset/regression
// losses and the classification loss is used for the heatmap loss.
optional Loss loss = 3;
// The name of the class that contains the keypoints for this task. This is
// used to retrieve the corresponding keypoint indices from the label map.
// Note that this corresponds to the "name" field, not "display_name".
optional string keypoint_class_name = 4;
// The standard deviation of the Gaussian kernel used to generate the
// keypoint heatmap, measured in pixels of the output image. This provides
// the flexibility of using a different Gaussian kernel size for each
// keypoint class. Note that if provided, the values specified here override
// the keypoint standard deviations; otherwise, the default value 5.0 will
// be used.
// TODO(yuhuic): Update the default value once we find the best value.
map<string, float> keypoint_label_to_std = 5;
// Loss weights corresponding to different heads.
optional float keypoint_regression_loss_weight = 6 [default = 1.0];
optional float keypoint_heatmap_loss_weight = 7 [default = 1.0];
optional float keypoint_offset_loss_weight = 8 [default = 1.0];
// The initial bias value of the convolution kernel of the keypoint heatmap
// prediction head. -2.19 corresponds to predicting foreground with
// a probability of 0.1. See "Focal Loss for Dense Object Detection"
// at https://arxiv.org/abs/1708.02002.
optional float heatmap_bias_init = 9 [default = -2.19];
// The heatmap score threshold for a keypoint to become a valid candidate.
optional float keypoint_candidate_score_threshold = 10 [default = 0.1];
// The maximum number of candidates to retrieve for each keypoint.
optional int32 num_candidates_per_keypoint = 11 [default = 100];
// Max pool kernel size to use to pull off peak score locations in a
// neighborhood (independently for each keypoint type).
optional int32 peak_max_pool_kernel_size = 12 [default = 3];
// The default score to use for regressed keypoints that are not
// successfully snapped to a nearby candidate.
optional float unmatched_keypoint_score = 13 [default = 0.1];
// The multiplier to expand the bounding boxes (either the provided boxes or
// those which tightly cover the regressed keypoints). Note that the newly
// expanded box for an instance becomes the feasible search window for all
// associated keypoints. (A candidate-snapping sketch follows this message.)
optional float box_scale = 14 [default = 1.2];
// The scale parameter that multiplies the largest dimension of a bounding
// box. The resulting distance becomes a search radius for candidates in the
// vicinity of each regressed keypoint.
optional float candidate_search_scale = 15 [default = 0.3];
// One of ['min_distance', 'score_distance_ratio'] indicating how to select
// the keypoint candidate.
optional string candidate_ranking_mode = 16 [default = "min_distance"];
// The radius (in the unit of output pixel) around heatmap peak to assign
// the offset targets. If set to 0, then the offset target will only be
// assigned to the heatmap peak (same behavior as the original paper).
optional int32 offset_peak_radius = 17 [default = 0];
// Indicates whether to assign offsets for each keypoint channel
// separately. If set to False, the output offset target has the shape
// [batch_size, out_height, out_width, 2] (same behavior as the original
// paper). If set to True, the output offset target has the shape [batch_size,
// out_height, out_width, 2 * num_keypoints] (recommended when the
// offset_peak_radius is not zero).
optional bool per_keypoint_offset = 18 [default = false];
}
repeated KeypointEstimation keypoint_estimation_task = 7;
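The fields box_scale, candidate_search_scale, candidate_ranking_mode, and unmatched_keypoint_score above describe how regressed keypoints are snapped to heatmap candidates. A minimal numpy sketch of the "min_distance" ranking path, with all names hypothetical (the actual logic lives in the CenterNet meta architecture, which is not part of this diff):

import numpy as np

def snap_keypoint(regressed_kp, candidates, candidate_scores, box_hw,
                  candidate_search_scale=0.3, unmatched_keypoint_score=0.1):
  # regressed_kp: [2] (y, x); candidates: [N, 2]; candidate_scores: [N];
  # box_hw: (height, width) of the instance's (possibly expanded) box.
  if len(candidates) == 0:
    return regressed_kp, unmatched_keypoint_score
  # Search radius: candidate_search_scale times the box's largest dimension.
  radius = candidate_search_scale * max(box_hw)
  dists = np.linalg.norm(candidates - regressed_kp, axis=1)
  in_radius = dists <= radius
  if not in_radius.any():
    # No candidate nearby: keep the regressed point with the default score.
    return regressed_kp, unmatched_keypoint_score
  # "min_distance": choose the closest candidate inside the radius.
  idx = int(np.argmin(np.where(in_radius, dists, np.inf)))
  return candidates[idx], candidate_scores[idx]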
// Parameters related to the mask estimation task.
// Note: Currently, CenterNet supports only a weak form of instance
// segmentation, where semantic segmentation masks are estimated and then
// cropped based on bounding box detections. Therefore, it is possible for
// the same image pixel to be assigned to multiple instances.
message MaskEstimation {
// Weight of the task loss. The total loss of the model will be the
// summation of task losses weighted by the weights.
optional float task_loss_weight = 1 [default = 1.0];
// Classification loss configuration for segmentation loss.
optional ClassificationLoss classification_loss = 2;
// Each instance mask (one per detection) is cropped and resized (bilinear
// resampling) from the predicted segmentation feature map. After
// resampling, the masks are binarized with the provided score threshold.
optional int32 mask_height = 4 [default = 256];
optional int32 mask_width = 5 [default = 256];
optional float score_threshold = 6 [default = 0.5];
// The initial bias value of the convolution kernel of the class heatmap
// prediction head. -2.19 corresponds to predicting foreground with
// a probability of 0.1.
optional float heatmap_bias_init = 3 [default = -2.19];
}
optional MaskEstimation mask_estimation_task = 8;
}
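To show how the message composes, here is a hedged sketch that parses a minimal CenterNet config with the generated Python bindings. The module path follows the repo's usual proto layout, and the field values (including the feature extractor type) are illustrative assumptions, not a tuned configuration:

from google.protobuf import text_format
from object_detection.protos import center_net_pb2

config_text = '''
  num_classes: 90
  feature_extractor { type: "hourglass_104" }
  object_center_params {
    object_center_loss_weight: 1.0
    min_box_overlap_iou: 0.7
    max_box_predictions: 100
  }
  object_detection_task {
    task_loss_weight: 1.0
    offset_loss_weight: 1.0
    scale_loss_weight: 0.1
  }
'''
config = text_format.Parse(config_text, center_net_pb2.CenterNet())
print(config.object_center_params.max_box_predictions)  # 100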
message CenterNetFeatureExtractor {
optional string type = 1;
// Channel means to be subtracted from each image channel. If not specified,
// we use a default value of 0.
repeated float channel_means = 2;
// Channel standard deviations. Each channel will be normalized by dividing
// it by its standard deviation. If not specified, we use a default value
// of 1.
repeated float channel_stds = 3;
// If set, will change the channel order to be [blue, green, red]. This can
// be useful for compatibility with some pre-trained feature extractors.
// (A short normalization sketch follows this message.)
optional bool bgr_ordering = 4 [default = false];
}
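A minimal numpy sketch of the preprocessing the three fields above describe (mean subtraction, division by standard deviation, optional BGR reordering); the function name is hypothetical:

import numpy as np

def normalize_channels(image, channel_means=(0.0, 0.0, 0.0),
                       channel_stds=(1.0, 1.0, 1.0), bgr_ordering=False):
  # image: [height, width, 3] RGB array.
  image = (np.asarray(image, np.float32) - channel_means) / channel_stds
  if bgr_ordering:
    # Reorder RGB -> BGR for compatibility with some pre-trained backbones.
    image = image[..., ::-1]
  return image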
@@ -188,7 +188,7 @@ message Context {
// Next id: 4
// The maximum number of contextual features per image, used for padding.
optional int32 max_num_context_features = 1 [default = 8500];
optional int32 max_num_context_features = 1 [default = 2000];
// The bottleneck feature dimension of the attention block.
optional int32 attention_bottleneck_dimension = 2 [default = 2048];
......
@@ -52,6 +52,12 @@ message Hyperparams {
// Whether depthwise convolutions should be regularized. If this parameter is
// NOT set then the conv hyperparams will default to the parent scope.
optional bool regularize_depthwise = 6 [default = false];
// By default, use_bias is set to False if batch_norm is not None and
// batch_norm.center is True. When force_use_bias is set to True, this
// behavior will be overridden, and use_bias will be set to True, regardless
// of batch norm parameters. Note that this applies only to
// KerasLayerHyperparams. (A sketch of the rule follows this message.)
optional bool force_use_bias = 8 [default = false];
}
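As a plain-Python sketch of the rule documented above (hypothetical helper; the real logic lives in KerasLayerHyperparams):

def resolve_use_bias(batch_norm, force_use_bias=False):
  # Bias is redundant when a batch norm layer with centering (beta) follows
  # the convolution, so use_bias defaults to False in that case unless
  # force_use_bias overrides it.
  if force_use_bias:
    return True
  return not (batch_norm is not None and batch_norm.center)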
// Proto with one-of field for regularizers.
......
@@ -31,7 +31,7 @@ enum InputType {
TF_SEQUENCE_EXAMPLE = 2; // TfSequenceExample Input
}
// Next id: 31
// Next id: 32
message InputReader {
// Name of input reader. Typically used to describe the dataset that is read
// by this input reader.
@@ -119,6 +119,10 @@ message InputReader {
// Type of instance mask.
optional InstanceMaskType mask_type = 10 [default = NUMERICAL_MASKS];
// Whether to load DensePose data. If set, must also set load_instance_masks
// to true.
optional bool load_dense_pose = 31 [default = false];
// Whether to use the display name when decoding examples. This is only used
// when mapping class text strings to integers.
optional bool use_display_name = 17 [default = false];
......
@@ -2,6 +2,7 @@ syntax = "proto2";
package object_detection.protos;
import "object_detection/protos/center_net.proto";
import "object_detection/protos/faster_rcnn.proto";
import "object_detection/protos/ssd.proto";
@@ -17,6 +18,7 @@ message DetectionModel {
// value to a function that builds your model.
ExperimentalModel experimental_model = 3;
CenterNet center_net = 4;
}
}
......
@@ -57,7 +57,8 @@ message NormalizeImage {
optional float target_maxval = 4 [default=1];
}
// Randomly horizontally flips the image and detections 50% of the time.
// Randomly horizontally flips the image and detections with the specified
// probability, defaulting to 50% of the time.
message RandomHorizontalFlip {
// Specifies a mapping from the original keypoint indices to horizontally
// flipped indices. This is used in the event that keypoints are specified,
@@ -71,10 +72,15 @@ message RandomHorizontalFlip {
// keypoint_flip_permutation: 3
// keypoint_flip_permutation: 5
// keypoint_flip_permutation: 4
// If nothing is specified, the order of the keypoints will be maintained
// (a sketch of applying the permutation follows this message).
repeated int32 keypoint_flip_permutation = 1;
// The probability of running this augmentation for each image.
optional float probability = 2 [default=0.5];
}
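A minimal numpy sketch of applying the flip permutation above (hypothetical function; the actual implementation is in the preprocessing library, not this diff):

import numpy as np

def flip_keypoints_horizontally(keypoints, flip_permutation):
  # keypoints: [num_instances, num_keypoints, 2] with normalized (y, x).
  y, x = keypoints[..., 0], keypoints[..., 1]
  flipped = np.stack([y, 1.0 - x], axis=-1)  # mirror x about the image center
  # Reorder channels so e.g. "left elbow" lands in the "right elbow" slot.
  return flipped[:, flip_permutation, :]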
// Randomly vertically flips the image and detections 50% of the time.
// Randomly vertically flips the image and detections with the specified
// probability, defaulting to 50% of the time.
message RandomVerticalFlip {
// Specifies a mapping from the original keypoint indices to vertically
// flipped indices. This is used in the event that keypoints are specified,
@@ -89,11 +95,23 @@ message RandomVerticalFlip {
// keypoint_flip_permutation: 5
// keypoint_flip_permutation: 4
repeated int32 keypoint_flip_permutation = 1;
// The probability of running this augmentation for each image.
optional float probability = 2 [default=0.5];
}
// Randomly rotates the image and detections by 90 degrees counter-clockwise
// 50% of the time.
message RandomRotation90 {}
// with the specified probability, defaulting to 50% of the time.
message RandomRotation90 {
// Specifies a mapping from the original keypoint indices to the indices
// after a 90-degree counter-clockwise rotation. This is used in the event
// that keypoints are specified, in which case the keypoints might need to
// be permuted when the image is rotated.
repeated int32 keypoint_rot_permutation = 1;
// The probability of running this augmentation for each image.
optional float probability = 2 [default=0.5];
}
// Randomly scales the values of all pixels in the image by some constant
// value between [minval, maxval], then clips the values to [0, 1.0].
@@ -457,7 +475,6 @@ message SSDRandomCropPadFixedAspectRatio {
// Converts class logits to softmax scores, optionally scaling the values by
// a temperature first. (A short sketch follows this message.)
message ConvertClassLogitsToSoftmax {
// Scale to use on logits before applying softmax.
optional float temperature = 1 [default=1.0];
}
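A short numpy sketch of the transform described above (hypothetical helper, shown only to make the temperature parameter concrete):

import numpy as np

def softmax_with_temperature(logits, temperature=1.0):
  # Scale logits by 1/temperature, then apply a numerically stable softmax.
  scaled = np.asarray(logits, np.float64) / temperature
  scaled -= scaled.max(axis=-1, keepdims=True)
  exp = np.exp(scaled)
  return exp / exp.sum(axis=-1, keepdims=True)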
@@ -472,12 +489,10 @@ message RandomSelfConcatImage {
// Apply an Autoaugment policy to the image and bounding boxes.
message AutoAugmentImage {
// Which AutoAugment policy to apply to the image.
optional string policy_name = 1 [default="v0"];
}
// Randomly drops ground truth boxes for a label with some probability.
message DropLabelProbabilistically {
// The label that should be dropped. This corresponds to one of the entries
@@ -487,7 +502,6 @@ message DropLabelProbabilistically {
optional float drop_probability = 2 [default = 1.0];
}
// Remap a set of labels to a new label.
message RemapLabels {
// Labels to be remapped.
......
@@ -59,7 +59,8 @@ message TrainConfig {
// Whether to load all checkpoint vars that match model variable names and
// sizes. This option is only available if `from_detection_checkpoint` is
// True.
// True. This option is *not* supported for TF2; setting it to true
// will raise an error.
optional bool load_all_detection_checkpoint_vars = 19 [default = false];
// Number of steps to train the DetectionModel for. If 0, will train the model
......