Unverified Commit 212bd0a6 authored by Jonathan Huang, committed by GitHub

Merge pull request #2619 from tombstone/update_protos

update proto definitions
parents 34f1d6bf 9adf0242
......@@ -54,6 +54,17 @@ py_proto_library(
deps = [":faster_rcnn_box_coder_proto"],
)
proto_library(
name = "keypoint_box_coder_proto",
srcs = ["keypoint_box_coder.proto"],
)
py_proto_library(
name = "keypoint_box_coder_py_pb2",
api_version = 2,
deps = [":keypoint_box_coder_proto"],
)
proto_library(
name = "mean_stddev_box_coder_proto",
srcs = ["mean_stddev_box_coder.proto"],
......@@ -81,6 +92,7 @@ proto_library(
srcs = ["box_coder.proto"],
deps = [
":faster_rcnn_box_coder_proto",
":keypoint_box_coder_proto",
":mean_stddev_box_coder_proto",
":square_box_coder_proto",
],
......
......@@ -3,6 +3,7 @@ syntax = "proto2";
package object_detection.protos;
import "object_detection/protos/faster_rcnn_box_coder.proto";
import "object_detection/protos/keypoint_box_coder.proto";
import "object_detection/protos/mean_stddev_box_coder.proto";
import "object_detection/protos/square_box_coder.proto";
......@@ -13,5 +14,6 @@ message BoxCoder {
FasterRcnnBoxCoder faster_rcnn_box_coder = 1;
MeanStddevBoxCoder mean_stddev_box_coder = 2;
SquareBoxCoder square_box_coder = 3;
KeypointBoxCoder keypoint_box_coder = 4;
}
}
......@@ -48,6 +48,8 @@ message ConvolutionalBoxPredictor {
// Whether to apply sigmoid to the output of class predictions.
// TODO: Do we need this since we have a post-processing module?
optional bool apply_sigmoid_to_scores = 9 [default = false];
optional float class_prediction_bias_init = 10 [default = 0.0];
}
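For context, a hypothetical pipeline-config fragment showing how the new class_prediction_bias_init field might be set; the box_predictor / convolutional_box_predictor nesting and the -4.6 value (a common negative prior for sigmoid-style losses) are illustrative assumptions, not part of this diff:

box_predictor {
  convolutional_box_predictor {
    # Initializes the class-prediction bias to a strong negative prior,
    # often paired with a sigmoid or focal classification loss.
    class_prediction_bias_init: -4.6
  }
}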
message MaskRCNNBoxPredictor {
......
......@@ -38,10 +38,10 @@ message EvalConfig {
optional bool ignore_groundtruth = 10 [default=false];
// Use exponential moving averages of variables for evaluation.
// TODO: When this is false make sure the model is constructed
// without moving averages in restore_fn.
optional bool use_moving_averages = 11 [default=false];
// Whether to evaluate instance masks.
// Note that since there is no evaluation code currently for instance
// segmentation, this option is unused.
optional bool eval_instance_masks = 12 [default=false];
}
......@@ -116,16 +116,34 @@ message FasterRcnn {
// Second stage classification loss weight
optional float second_stage_classification_loss_weight = 26 [default=1.0];
// If not left to default, applies hard example mining.
optional HardExampleMiner hard_example_miner = 27;
// Second stage instance mask loss weight. Note that this is only applicable
// when `MaskRCNNBoxPredictor` is selected for second stage and configured to
// predict instance masks.
optional float second_stage_mask_prediction_loss_weight = 27 [default=1.0];
// If not left to default, applies hard example mining only to classification
// and localization loss.
optional HardExampleMiner hard_example_miner = 28;
// Loss for second stage box classifiers, supports Softmax and Sigmoid.
// Note that the score converter must be consistent with the loss type.
// When multiple labels are assigned to the same boxes, we recommend
// using the sigmoid loss and enabling merge_multiple_label_boxes.
// If not specified, the Softmax loss is used by default.
optional ClassificationLoss second_stage_classification_loss = 29;
}
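A sketch of how the new second-stage fields might appear in a model config, assuming the standard model { faster_rcnn { ... } } layout; the values are illustrative:

model {
  faster_rcnn {
    # Weight for the instance mask loss (only relevant when a
    # MaskRCNNBoxPredictor is configured to predict masks).
    second_stage_mask_prediction_loss_weight: 1.0
    # Sigmoid loss for the second-stage classifier; keep the score converter
    # consistent and, for multi-label boxes, enable merge_multiple_label_boxes.
    second_stage_classification_loss {
      weighted_sigmoid {
      }
    }
  }
}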
message FasterRcnnFeatureExtractor {
// Type of Faster R-CNN model (e.g., 'faster_rcnn_resnet101';
// See models/model_builder.py for expected types).
// See builders/model_builder.py for expected types).
optional string type = 1;
// Output stride of extracted RPN feature map.
optional int32 first_stage_features_stride = 2 [default=16];
// Whether to update batch norm parameters during training or not.
// When training with a relatively large batch size (e.g. 8), it could be
// desirable to enable batch norm update.
optional bool batch_norm_trainable = 3 [default=false];
}
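A minimal feature_extractor sketch using only the fields shown above; the enclosing faster_rcnn block is assumed:

feature_extractor {
  type: "faster_rcnn_resnet101"
  first_stage_features_stride: 16
  # Enable batch norm updates when the batch size is relatively large.
  batch_norm_trainable: true
}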
......@@ -11,6 +11,13 @@ message ImageResizer {
}
}
// Enumeration type for image resizing methods provided in TensorFlow.
enum ResizeType {
BILINEAR = 0; // Corresponds to tf.image.ResizeMethod.BILINEAR
NEAREST_NEIGHBOR = 1; // Corresponds to tf.image.ResizeMethod.NEAREST_NEIGHBOR
BICUBIC = 2; // Corresponds to tf.image.ResizeMethod.BICUBIC
AREA = 3; // Corresponds to tf.image.ResizeMethod.AREA
}
// Configuration proto for image resizer that keeps aspect ratio.
message KeepAspectRatioResizer {
......@@ -19,8 +26,10 @@ message KeepAspectRatioResizer {
// Desired size of the larger image dimension in pixels.
optional int32 max_dimension = 2 [default = 1024];
}
// Desired method when resizing image.
optional ResizeType resize_method = 3 [default = BILINEAR];
}
// Configuration proto for image resizer that resizes to a fixed shape.
message FixedShapeResizer {
......@@ -29,4 +38,7 @@ message FixedShapeResizer {
// Desired width of image in pixels.
optional int32 width = 2 [default = 300];
// Desired method when resizing image.
optional ResizeType resize_method = 3 [default = BILINEAR];
}
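An illustrative image_resizer config using the new resize_method field; the fixed_shape_resizer nesting inside the ImageResizer oneof is assumed:

image_resizer {
  fixed_shape_resizer {
    height: 300
    width: 300
    # Any ResizeType value works here; NEAREST_NEIGHBOR is just an example.
    resize_method: NEAREST_NEIGHBOR
  }
}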
......@@ -49,8 +49,8 @@ message InputReader {
// An input reader that reads TF Example protos from local TFRecord files.
message TFRecordInputReader {
// Path to TFRecordFile.
optional string input_path = 1 [default=""];
// Path(s) to `TFRecordFile`s.
repeated string input_path = 1;
}
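Since input_path is now repeated, multiple TFRecord shards can be listed. A hypothetical reader block (the train_input_reader / tf_record_input_reader nesting and the paths are illustrative):

train_input_reader {
  tf_record_input_reader {
    # Repeated field: list each shard on its own line.
    input_path: "data/train-00000-of-00002.tfrecord"
    input_path: "data/train-00001-of-00002.tfrecord"
  }
}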
// An externally defined input reader. Users may define an extension to this
......
syntax = "proto2";
package object_detection.protos;
// Configuration proto for KeypointBoxCoder. See
// box_coders/keypoint_box_coder.py for details.
message KeypointBoxCoder {
optional int32 num_keypoints = 1;
// Scale factor for anchor encoded box center and keypoints.
optional float y_scale = 2 [default = 10.0];
optional float x_scale = 3 [default = 10.0];
// Scale factor for anchor encoded box height.
optional float height_scale = 4 [default = 5.0];
// Scale factor for anchor encoded box width.
optional float width_scale = 5 [default = 5.0];
}
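A sketch of selecting the new coder via the BoxCoder oneof shown earlier in this diff; the enclosing model block is assumed and the scale values are the proto defaults:

box_coder {
  keypoint_box_coder {
    # Number of keypoints encoded with each box (no proto default).
    num_keypoints: 6
    y_scale: 10.0
    x_scale: 10.0
    height_scale: 5.0
    width_scale: 5.0
  }
}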
......@@ -53,6 +53,7 @@ message ClassificationLoss {
WeightedSigmoidClassificationLoss weighted_sigmoid = 1;
WeightedSoftmaxClassificationLoss weighted_softmax = 2;
BootstrappedSigmoidClassificationLoss bootstrapped_sigmoid = 3;
SigmoidFocalClassificationLoss weighted_sigmoid_focal = 4;
}
}
......@@ -62,10 +63,23 @@ message WeightedSigmoidClassificationLoss {
optional bool anchorwise_output = 1 [default=false];
}
// Sigmoid Focal cross entropy loss as described in
// https://arxiv.org/abs/1708.02002
message SigmoidFocalClassificationLoss {
optional bool anchorwise_output = 1 [default = false];
// modulating factor for the loss.
optional float gamma = 2 [default = 2.0];
// alpha weighting factor for the loss.
optional float alpha = 3;
}
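A hedged example of selecting the new focal loss, assuming the standard loss { classification_loss { ... } } nesting; the gamma/alpha values follow the cited paper's common settings and are only illustrative:

classification_loss {
  weighted_sigmoid_focal {
    # Focusing (modulating) parameter from https://arxiv.org/abs/1708.02002.
    gamma: 2.0
    # Optional class-balancing weight; omit to leave it unset.
    alpha: 0.25
  }
}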
// Classification loss using a softmax function over class predictions.
message WeightedSoftmaxClassificationLoss {
// Output loss per anchor.
optional bool anchorwise_output = 1 [default=false];
// Scale logit (input) value before calculating softmax classification loss.
// Typically used for softmax distillation.
optional float logit_scale = 2 [default = 1.0];
}
// Classification loss using a sigmoid function over the class prediction with
......
......@@ -12,24 +12,24 @@ message Optimizer {
MomentumOptimizer momentum_optimizer = 2;
AdamOptimizer adam_optimizer = 3;
}
optional bool use_moving_average = 4 [default=true];
optional float moving_average_decay = 5 [default=0.9999];
optional bool use_moving_average = 4 [default = true];
optional float moving_average_decay = 5 [default = 0.9999];
}
// Configuration message for the RMSPropOptimizer
// See: https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
message RMSPropOptimizer {
optional LearningRate learning_rate = 1;
optional float momentum_optimizer_value = 2 [default=0.9];
optional float decay = 3 [default=0.9];
optional float epsilon = 4 [default=1.0];
optional float momentum_optimizer_value = 2 [default = 0.9];
optional float decay = 3 [default = 0.9];
optional float epsilon = 4 [default = 1.0];
}
// Configuration message for the MomentumOptimizer
// See: https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
message MomentumOptimizer {
optional LearningRate learning_rate = 1;
optional float momentum_optimizer_value = 2 [default=0.9];
optional float momentum_optimizer_value = 2 [default = 0.9];
}
// Configuration message for the AdamOptimizer
......@@ -44,30 +44,40 @@ message LearningRate {
ConstantLearningRate constant_learning_rate = 1;
ExponentialDecayLearningRate exponential_decay_learning_rate = 2;
ManualStepLearningRate manual_step_learning_rate = 3;
CosineDecayLearningRate cosine_decay_learning_rate = 4;
}
}
// Configuration message for a constant learning rate.
message ConstantLearningRate {
optional float learning_rate = 1 [default=0.002];
optional float learning_rate = 1 [default = 0.002];
}
// Configuration message for an exponentially decaying learning rate.
// See https://www.tensorflow.org/versions/master/api_docs/python/train/ \
// decaying_the_learning_rate#exponential_decay
message ExponentialDecayLearningRate {
optional float initial_learning_rate = 1 [default=0.002];
optional uint32 decay_steps = 2 [default=4000000];
optional float decay_factor = 3 [default=0.95];
optional bool staircase = 4 [default=true];
optional float initial_learning_rate = 1 [default = 0.002];
optional uint32 decay_steps = 2 [default = 4000000];
optional float decay_factor = 3 [default = 0.95];
optional bool staircase = 4 [default = true];
}
// Configuration message for a manually defined learning rate schedule.
message ManualStepLearningRate {
optional float initial_learning_rate = 1 [default=0.002];
optional float initial_learning_rate = 1 [default = 0.002];
message LearningRateSchedule {
optional uint32 step = 1;
optional float learning_rate = 2 [default=0.002];
optional float learning_rate = 2 [default = 0.002];
}
repeated LearningRateSchedule schedule = 2;
}
// Configuration message for a cosine decaying learning rate as defined in
// object_detection/utils/learning_schedules.py
message CosineDecayLearningRate {
optional float learning_rate_base = 1 [default = 0.002];
optional uint32 total_steps = 2 [default = 4000000];
optional float warmup_learning_rate = 3 [default = 0.0002];
optional uint32 warmup_steps = 4 [default = 10000];
}
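A sketch of the new cosine schedule wired through the Optimizer and MomentumOptimizer messages shown above; the step counts and rates are illustrative:

optimizer {
  momentum_optimizer {
    learning_rate {
      cosine_decay_learning_rate {
        learning_rate_base: 0.004
        total_steps: 200000
        warmup_learning_rate: 0.0004
        warmup_steps: 10000
      }
    }
    momentum_optimizer_value: 0.9
  }
  use_moving_average: false
}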
......@@ -39,4 +39,8 @@ message PostProcessing {
// Score converter to use.
optional ScoreConverter score_converter = 2 [default = IDENTITY];
// Scale logit (input) value before conversion in post-processing step.
// Typically used for softmax distillation, though can be used to scale for
// other reasons.
optional float logit_scale = 3 [default = 1.0];
}
......@@ -29,6 +29,9 @@ message PreprocessingStep {
SSDRandomCrop ssd_random_crop = 21;
SSDRandomCropPad ssd_random_crop_pad = 22;
SSDRandomCropFixedAspectRatio ssd_random_crop_fixed_aspect_ratio = 23;
SSDRandomCropPadFixedAspectRatio ssd_random_crop_pad_fixed_aspect_ratio = 24;
RandomVerticalFlip random_vertical_flip = 25;
RandomRotation90 random_rotation90 = 26;
}
}
......@@ -42,10 +45,44 @@ message NormalizeImage {
optional float target_maxval = 4 [default=1];
}
// Randomly horizontally mirrors the image and detections 50% of the time.
// Randomly horizontally flips the image and detections 50% of the time.
message RandomHorizontalFlip {
// Specifies a mapping from the original keypoint indices to horizontally
// flipped indices. If keypoints are specified, they are permuted with this
// mapping whenever the image is horizontally flipped. E.g. for keypoints
// representing left_eye, right_eye, nose_tip, mouth, left_ear, right_ear
// (in that order), one might specify the keypoint_flip_permutation below:
// keypoint_flip_permutation: 1
// keypoint_flip_permutation: 0
// keypoint_flip_permutation: 2
// keypoint_flip_permutation: 3
// keypoint_flip_permutation: 5
// keypoint_flip_permutation: 4
repeated int32 keypoint_flip_permutation = 1;
}
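The permutation from the comment above, written as a config entry; the train_config / data_augmentation_options nesting is assumed:

data_augmentation_options {
  random_horizontal_flip {
    # left_eye <-> right_eye and left_ear <-> right_ear swap;
    # nose_tip and mouth map to themselves.
    keypoint_flip_permutation: 1
    keypoint_flip_permutation: 0
    keypoint_flip_permutation: 2
    keypoint_flip_permutation: 3
    keypoint_flip_permutation: 5
    keypoint_flip_permutation: 4
  }
}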
// Randomly vertically flips the image and detections 50% of the time.
message RandomVerticalFlip {
// Specifies a mapping from the original keypoint indices to vertically
// flipped indices. If keypoints are specified, they are permuted with this
// mapping whenever the image is vertically flipped. E.g. for keypoints
// representing left_eye, right_eye, nose_tip, mouth, left_ear, right_ear
// (in that order), one might specify the keypoint_flip_permutation below:
// keypoint_flip_permutation: 1
// keypoint_flip_permutation: 0
// keypoint_flip_permutation: 2
// keypoint_flip_permutation: 3
// keypoint_flip_permutation: 5
// keypoint_flip_permutation: 4
repeated int32 keypoint_flip_permutation = 1;
}
// Randomly rotates the image and detections by 90 degrees counter-clockwise
// 50% of the time.
message RandomRotation90 {}
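The two new augmentation steps have no required fields, so enabling them (assuming the same data_augmentation_options nesting) is just:

data_augmentation_options {
  random_vertical_flip {
  }
}
data_augmentation_options {
  random_rotation90 {
  }
}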
// Randomly scales the values of all pixels in the image by some constant value
// between [minval, maxval], then clips the values to the range [0, 1.0].
message RandomPixelValueScale {
......@@ -324,3 +361,45 @@ message SSDRandomCropFixedAspectRatio {
// Aspect ratio to crop to. This value is used for all crop operations.
optional float aspect_ratio = 2 [default=1.0];
}
message SSDRandomCropPadFixedAspectRatioOperation {
// Cropped image must cover at least this fraction of one original bounding
// box.
optional float min_object_covered = 1;
// The aspect ratio of the cropped image must be within the range of
// [min_aspect_ratio, max_aspect_ratio].
optional float min_aspect_ratio = 2;
optional float max_aspect_ratio = 3;
// The area of the cropped image must be within the range of
// [min_area, max_area].
optional float min_area = 4;
optional float max_area = 5;
// Cropped box area ratio must be above this threshold to be kept.
optional float overlap_thresh = 6;
// Probability a crop operation is skipped.
optional float random_coef = 7;
// Min ratio of padded image height and width to the input image's height and
// width. Two entries per operation.
repeated float min_padded_size_ratio = 8;
// Max ratio of padded image height and width to the input image's height and
// width. Two entries per operation.
repeated float max_padded_size_ratio = 9;
}
// Randomly crops and pads an image to a fixed aspect ratio according to:
// Liu et al., SSD: Single shot multibox detector.
// Multiple SSDRandomCropPadFixedAspectRatioOperations are defined by this
// preprocessing step. Only one operation (chosen at random) is actually
// performed on an image.
message SSDRandomCropPadFixedAspectRatio {
repeated SSDRandomCropPadFixedAspectRatioOperation operations = 1;
// Aspect ratio to pad to. This value is used for all crop and pad operations.
optional float aspect_ratio = 2 [default=1.0];
}
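A hypothetical single-operation config for the new step; every numeric value below is illustrative, and min/max_padded_size_ratio take two entries each (height, then width):

data_augmentation_options {
  ssd_random_crop_pad_fixed_aspect_ratio {
    operations {
      min_object_covered: 0.5
      min_aspect_ratio: 0.5
      max_aspect_ratio: 2.0
      min_area: 0.1
      max_area: 1.0
      overlap_thresh: 0.3
      random_coef: 0.15
      # Two entries per operation: height ratio, then width ratio.
      min_padded_size_ratio: 1.0
      min_padded_size_ratio: 1.0
      max_padded_size_ratio: 2.0
      max_padded_size_ratio: 2.0
    }
    aspect_ratio: 1.0
  }
}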
......@@ -62,4 +62,24 @@ message SsdFeatureExtractor {
// Hyperparameters for the feature extractor.
optional Hyperparams conv_hyperparams = 4;
// The nearest multiple to zero-pad the input height and width dimensions to.
// For example, if pad_to_multiple = 2, input dimensions are zero-padded
// until the resulting dimensions are even.
optional int32 pad_to_multiple = 5 [default = 1];
// Whether to update batch norm parameters during training or not.
// When training with a relatively small batch size (e.g. 1), it is
// desirable to disable batch norm update and use pretrained batch norm
// params.
//
// Note: Some feature extractors are used with canned arg_scopes
// (e.g. resnet arg scopes). In these cases, the training behavior of batch
// norm variables may depend on both `batch_norm_trainable` and
// `is_training`.
//
// When canned arg_scopes are used with feature extractors, `conv_hyperparams`
// applies only to the additional layers that are added outside the canned
// arg_scope.
optional bool batch_norm_trainable = 6 [default=true];
}
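A sketch of the two new feature-extractor fields; the type string is only an example and the feature_extractor nesting inside the ssd model block is assumed:

feature_extractor {
  type: "ssd_mobilenet_v1"
  # Zero-pad input height/width up to the nearest multiple of 32.
  pad_to_multiple: 32
  # Freeze batch norm statistics when training with very small batches.
  batch_norm_trainable: false
  # conv_hyperparams omitted for brevity.
}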
......@@ -15,11 +15,41 @@ message SsdAnchorGenerator {
// Scale of anchors corresponding to coarsest resolution
optional float max_scale = 3 [default = 0.95];
// Can be used to override min_scale->max_scale, with an explicitly defined
// set of scales. If empty, then min_scale->max_scale is used.
repeated float scales = 12;
// Aspect ratios for anchors at each grid point.
repeated float aspect_ratios = 4;
// When this aspect ratio is greater than 0, an additional anchor with an
// interpolated scale is added at this aspect ratio.
optional float interpolated_scale_aspect_ratio = 13 [default = 1.0];
// Whether to use the following aspect ratio and scale combination for the
// layer with the finest resolution: (scale=0.1, aspect_ratio=1.0),
// (scale=min_scale, aspect_ratio=2.0), (scale=min_scale, aspect_ratio=0.5).
optional bool reduce_boxes_in_lowest_layer = 5 [default = true];
// The base anchor size in height dimension.
optional float base_anchor_height = 6 [default = 1.0];
// The base anchor size in width dimension.
optional float base_anchor_width = 7 [default = 1.0];
// Anchor stride in height dimension in pixels for each layer. The length of
// this field is expected to be equal to the value of num_layers.
repeated int32 height_stride = 8;
// Anchor stride in width dimension in pixels for each layer. The length of
// this field is expected to be equal to the value of num_layers.
repeated int32 width_stride = 9;
// Anchor height offset in pixels for each layer. The length of this field is
// expected to be equal to the value of num_layers.
repeated int32 height_offset = 10;
// Anchor width offset in pixels for each layer. The length of this field is
// expected to be equal to the value of num_layers.
repeated int32 width_offset = 11;
}
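A sketch showing the new explicit-scales override together with the interpolated-scale option; the anchor_generator / ssd_anchor_generator nesting and the specific scale values are assumptions:

anchor_generator {
  ssd_anchor_generator {
    aspect_ratios: 1.0
    aspect_ratios: 2.0
    aspect_ratios: 0.5
    # Explicit per-layer scales override the min_scale -> max_scale range.
    scales: 0.1
    scales: 0.35
    scales: 0.5
    scales: 0.65
    scales: 0.8
    scales: 0.95
    # Adds one extra anchor per location with an interpolated scale.
    interpolated_scale_aspect_ratio: 1.0
  }
}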
......@@ -54,11 +54,16 @@ message TrainConfig {
optional int32 replicas_to_aggregate = 13 [default=1];
// Maximum number of elements to store within a queue.
optional int32 batch_queue_capacity = 14 [default=600];
optional int32 batch_queue_capacity = 14 [default=150];
// Number of threads to use for batching.
optional int32 num_batch_queue_threads = 15 [default=8];
// Maximum capacity of the queue used to prefetch assembled batches.
optional int32 prefetch_queue_capacity = 16 [default=10];
optional int32 prefetch_queue_capacity = 16 [default=5];
// If true, boxes with the same coordinates will be merged together.
// This is useful when each box can have multiple labels.
// Note that only Sigmoid classification losses should be used.
optional bool merge_multiple_label_boxes = 17 [default=false];
}
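A hypothetical train_config fragment for the multi-label case; per the comment above, this should be paired with a sigmoid-based classification loss, and the queue capacities shown are just the new defaults:

train_config {
  # Merge boxes that share coordinates so each merged box can carry
  # multiple labels; use only with sigmoid-style classification losses.
  merge_multiple_label_boxes: true
  batch_queue_capacity: 150
  prefetch_queue_capacity: 5
}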