syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/anchor_generator.proto";
import "object_detection/protos/box_predictor.proto";
import "object_detection/protos/hyperparams.proto";
import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/post_processing.proto";
import "object_detection/protos/fpn.proto";

// Configuration for Faster R-CNN models.
// See meta_architectures/faster_rcnn_meta_arch.py and models/model_builder.py
//
// Naming conventions:
// Faster R-CNN models have two stages: a first stage region proposal network
// (or RPN) and a second stage box classifier.  We thus use the prefixes
// `first_stage_` and `second_stage_` to indicate the stage to which each
// parameter pertains when relevant.
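//
// As an illustration (a sketch, not a shipped configuration; num_classes is
// dataset dependent and the other values simply echo the defaults below), a
// pipeline config populates this message in text format under
// `model { faster_rcnn { ... } }`, with the stage prefixes marking which
// stage each field controls:
//
//   model {
//     faster_rcnn {
//       num_classes: 90
//       first_stage_nms_iou_threshold: 0.7
//       first_stage_max_proposals: 300
//       second_stage_batch_size: 64
//     }
//   }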

message FasterRcnn {
  // Number of stages to construct. Setting this to 1 builds only the first
  // stage Region Proposal Network (RPN); the default of 2 also builds the
  // second stage box classifier.
  optional int32 number_of_stages = 1 [default = 2];

  // Number of classes to predict.
  optional int32 num_classes = 3;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 4;

  // Feature extractor config.
  optional FasterRcnnFeatureExtractor feature_extractor = 5;

  // (First stage) region proposal network (RPN) parameters.

  // Anchor generator to compute RPN anchors.
  optional AnchorGenerator first_stage_anchor_generator = 6;

  // Atrous rate for the convolution op applied to the
  // `first_stage_features_to_crop` tensor to obtain box predictions.
  optional int32 first_stage_atrous_rate = 7 [default = 1];
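
  // (An atrous rate r dilates the convolution kernel by sampling its input
  // with gaps of r - 1 pixels, enlarging the receptive field at no extra
  // parameter cost; r = 1 is a standard convolution.)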

  // Hyperparameters for the convolutional RPN box predictor.
  optional Hyperparams first_stage_box_predictor_conv_hyperparams = 8;

  // Kernel size to use for the convolution op just prior to RPN box
  // predictions.
  optional int32 first_stage_box_predictor_kernel_size = 9 [default = 3];

  // Output depth for the convolution op just prior to RPN box predictions.
  optional int32 first_stage_box_predictor_depth = 10 [default = 512];

  // The batch size to use for computing the first stage objectness and
  // location losses.
  optional int32 first_stage_minibatch_size = 11 [default = 256];

  // Fraction of positive examples per image for the RPN.
  optional float first_stage_positive_balance_fraction = 12 [default = 0.5];

  // Non max suppression score threshold applied to first stage RPN proposals.
  optional float first_stage_nms_score_threshold = 13 [default = 0.0];

  // Non max suppression IOU threshold applied to first stage RPN proposals.
  optional float first_stage_nms_iou_threshold = 14 [default = 0.7];

  // Maximum number of RPN proposals retained after first stage postprocessing.
  optional int32 first_stage_max_proposals = 15 [default = 300];

  // First stage RPN localization loss weight.
  optional float first_stage_localization_loss_weight = 16 [default = 1.0];

  // First stage RPN objectness loss weight.
  optional float first_stage_objectness_loss_weight = 17 [default = 1.0];

  // Per-region cropping parameters.
  // Note that if an R-FCN model is constructed, the per-region cropping
  // parameters below are ignored.

  // Output size (width and height are set to be the same) of the initial
  // bilinear interpolation based cropping during ROI pooling.
  optional int32 initial_crop_size = 18;

  // Kernel size of the max pool op on the cropped feature map during
  // ROI pooling.
  optional int32 maxpool_kernel_size = 19;

  // Stride of the max pool op on the cropped feature map during ROI pooling.
  optional int32 maxpool_stride = 20;
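
  // As an illustrative sketch (this 14 -> 7 setup mirrors the common
  // ResNet-style configuration; it is an example, not a requirement), a
  // 14x14 bilinear crop followed by a 2x2, stride-2 max pool yields the
  // canonical 7x7 ROI feature map:
  //
  //   initial_crop_size: 14
  //   maxpool_kernel_size: 2
  //   maxpool_stride: 2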

  // (Second stage) box classifier parameters.

  // Hyperparameters for the second stage box predictor. If the box predictor
  // type is set to rfcn_box_predictor, an R-FCN model is constructed;
  // otherwise a Faster R-CNN model is constructed.
  optional BoxPredictor second_stage_box_predictor = 21;

  // The batch size per image used for computing the classification and refined
  // location loss of the box classifier.
  // Note that this field is ignored if `hard_example_miner` is configured.
  optional int32 second_stage_batch_size = 22 [default = 64];

  // Fraction of positive examples to use per image for the box classifier.
  optional float second_stage_balance_fraction = 23 [default = 0.25];

  // Post processing to apply on the second stage box classifier predictions.
  // Note: the `score_converter` provided to the FasterRCNNMetaArch constructor
  // is taken from this `second_stage_post_processing` proto.
  optional PostProcessing second_stage_post_processing = 24;
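
  // For example, an illustrative sketch (field names come from
  // post_processing.proto; the values are typical rather than prescribed):
  //
  //   second_stage_post_processing {
  //     batch_non_max_suppression {
  //       score_threshold: 0.0
  //       iou_threshold: 0.6
  //       max_detections_per_class: 100
  //       max_total_detections: 300
  //     }
  //     score_converter: SOFTMAX
  //   }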

  // Second stage refined localization loss weight.
  optional float second_stage_localization_loss_weight = 25 [default = 1.0];

  // Second stage classification loss weight.
  optional float second_stage_classification_loss_weight = 26 [default = 1.0];

  // Second stage instance mask loss weight. Note that this is only applicable
  // when `MaskRCNNBoxPredictor` is selected for second stage and configured to
  // predict instance masks.
  optional float second_stage_mask_prediction_loss_weight = 27 [default = 1.0];

  // If not left to default, applies hard example mining only to the
  // classification and localization losses.
  optional HardExampleMiner hard_example_miner = 28;

  // Loss for the second stage box classifiers; supports Softmax and Sigmoid.
  // Note that the score converter must be consistent with the loss type.
  // When multiple labels are assigned to the same boxes, it is recommended to
  // use Sigmoid loss and to enable merge_multiple_label_boxes.
  // If not specified, Softmax loss is used by default.
  optional ClassificationLoss second_stage_classification_loss = 29;
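
  // For the multi-label case described above, an illustrative sketch (the
  // weighted_sigmoid field comes from losses.proto):
  //
  //   second_stage_classification_loss {
  //     weighted_sigmoid {
  //     }
  //   }
  //
  // paired with a SIGMOID score_converter in second_stage_post_processing.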

  // Whether to update batch_norm inplace during training. This is required
  // for batch norm to work correctly on TPUs. When this is false, the user
  // must add a control dependency on tf.GraphKeys.UPDATE_OPS for the
  // train/loss op in order to update the batch norm moving average parameters.
  optional bool inplace_batchnorm_update = 30 [default = false];

  // Force the use of matrix-multiplication-based crop and resize instead of
  // the standard tf.image.crop_and_resize while computing second stage input
  // feature maps.
  optional bool use_matmul_crop_and_resize = 31 [default = false];

  // Normally, anchors generated for a given image size are pruned during
  // training if they lie outside the image window. Setting this option to
  // true clips the anchors to lie within the image instead of pruning them.
  optional bool clip_anchors_to_image = 32 [default = false];

  // After performing matching between anchors and targets, a gather operation
  // is used to pull out the targets for training the Faster R-CNN meta
  // architecture. This option specifies whether to use an alternate
  // implementation of tf.gather that is faster on TPUs.
  optional bool use_matmul_gather_in_matcher = 33 [default = false];

  // Whether to use the balanced positive negative sampler implementation with
  // static shape guarantees.
  optional bool use_static_balanced_label_sampler = 34 [default = false];

  // If true, uses implementations of ops with static shape guarantees.
  optional bool use_static_shapes = 35 [default = false];

  // Whether the masks present in groundtruth should be resized in the model to
  // match the image size.
  optional bool resize_masks = 36 [default = true];

  // If true, uses implementations of ops with static shape guarantees when
  // running evaluation (i.e., when is_training is false).
  optional bool use_static_shapes_for_eval = 37 [default = false];

  // If true, uses the partitioned_non_max_suppression implementation in the
  // first stage.
  optional bool use_partitioned_nms_in_first_stage = 38 [default = true];

  // Whether to return raw detections (pre NMS).
  optional bool return_raw_detections_during_predict = 39 [default = false];

  // Whether to use tf.image.combined_non_max_suppression.
  optional bool use_combined_nms_in_first_stage = 40 [default = false];

  // Whether to output final box features. If true, the RPN feature map is
  // cropped based on the final prediction boxes and the crops are passed
  // through the box classifier to compute the final box features in the
  // postprocess() method.
  optional bool output_final_box_features = 42 [default = false];

  // Whether to output final box RPN features. If true, the RPN feature map is
  // cropped in the postprocess() method based on the final prediction boxes.
  optional bool output_final_box_rpn_features = 43 [default = false];

  // Config for the context model (Context R-CNN); see the Context message
  // below.
  optional Context context_config = 41;
}

// Where in the model the attention block is applied.
enum AttentionPosition {
  ATTENTION_DEFAULT = 0;        // Default, currently post box classifier
  POST_BOX_CLASSIFIER = 1;      // Post box classifier
  POST_RPN = 2;                 // Post RPN, pre box classifier
}

// Configuration proto for Context R-CNN.
message Context {
  // Next id: 12

  // The maximum number of contextual features per image, used for padding.
  optional int32 max_num_context_features = 1 [default = 2000];

  // The bottleneck feature dimension of the attention block.
  optional int32 attention_bottleneck_dimension = 2 [default = 2048];

  // The attention temperature.
  optional float attention_temperature = 3 [default = 0.01];

  // The context feature length.
  optional int32 context_feature_length = 4 [default = 2057];
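
  // Putting the fields above together, an illustrative context_config sketch
  // (the values simply echo this file's defaults):
  //
  //   context_config {
  //     max_num_context_features: 2000
  //     attention_bottleneck_dimension: 2048
  //     attention_temperature: 0.01
  //     context_feature_length: 2057
  //   }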

  // Whether to use self-attention from the box proposals to themselves
  // (TF1 only).
  optional bool use_self_attention = 6 [default = false];

  // Whether to use attention into the context features; setting this to
  // false is only implemented in TF1.
  optional bool use_long_term_attention = 7 [default = true];

  // Whether the self-attention block and the long term attention block should
  // run in sequence or in parallel, i.e. whether the outputs of the
  // self-attention block should be the inputs to the long term attention
  // block (sequence), or whether the two blocks should run in parallel with
  // their outputs summed.
  optional bool self_attention_in_sequence = 8 [default = false];

  // Number of attention heads.
  optional int32 num_attention_heads = 9 [default = 1];

  // Number of attention layers.
  optional int32 num_attention_layers = 11 [default = 1];

  // Where the attention block is applied (see the AttentionPosition enum).
  optional AttentionPosition attention_position = 10 [
    default = POST_BOX_CLASSIFIER];
}

message FasterRcnnFeatureExtractor {
  // Type of Faster R-CNN feature extractor (e.g., 'faster_rcnn_resnet101';
  // see builders/model_builder.py for expected types).
  optional string type = 1;
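
  // For instance, an illustrative sketch (see builders/model_builder.py for
  // the full set of supported type strings):
  //
  //   feature_extractor {
  //     type: 'faster_rcnn_resnet101'
  //     first_stage_features_stride: 16
  //   }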

  // Output stride of extracted RPN feature map.
  optional int32 first_stage_features_stride = 2 [default = 16];

  // Whether to update batch norm parameters during training or not.
  // When training with a relatively large batch size (e.g., 8), it can be
  // desirable to enable batch norm update.
  optional bool batch_norm_trainable = 3 [default = false];

  // Hyperparameters that affect the layers of feature extractor added on top
  // of the base feature extractor.
  optional Hyperparams conv_hyperparams = 4;

  // If set to true, the base feature extractor's hyperparams will be
  // overridden with `conv_hyperparams`.
  optional bool override_base_feature_extractor_hyperparams = 5
      [default = false];

  // The nearest multiple to zero-pad the input height and width dimensions to.
  // For example, if pad_to_multiple = 2, input dimensions are zero-padded
  // until the resulting dimensions are even.
  optional int32 pad_to_multiple = 6 [default = 32];
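
  // E.g., with the default pad_to_multiple = 32, a 600x800 input is
  // zero-padded to 608x800, 608 and 800 being the nearest multiples of 32.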

  // Feature Pyramid Networks config.
  optional FeaturePyramidNetworks fpn = 7;
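
  // An illustrative sketch (field names come from fpn.proto; the level values
  // are an assumption for illustration, not defaults taken from that file):
  //
  //   fpn {
  //     min_level: 2
  //     max_level: 6
  //   }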
}