syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/anchor_generator.proto";
import "object_detection/protos/box_coder.proto";
import "object_detection/protos/box_predictor.proto";
import "object_detection/protos/hyperparams.proto";
import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/matcher.proto";
import "object_detection/protos/post_processing.proto";
import "object_detection/protos/region_similarity_calculator.proto";

// Configuration for Single Shot Detection (SSD) models.
// Next id: 27
message Ssd {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 2;

  // Feature extractor config.
  optional SsdFeatureExtractor feature_extractor = 3;

  // Box coder to encode the boxes.
  optional BoxCoder box_coder = 4;

  // Matcher to match groundtruth with anchors.
  optional Matcher matcher = 5;

  // Region similarity calculator to compute similarity of boxes.
  optional RegionSimilarityCalculator similarity_calculator = 6;

  // Whether background targets are to be encoded as an all
  // zeros vector or a one-hot vector (where background is the 0th class).
  optional bool encode_background_as_zeros = 12 [default = false];

  // Classification weight to be associated to negative
  // anchors (default: 1.0). The weight must be in [0., 1.].
  optional float negative_class_weight = 13 [default = 1.0];

  // Box predictor to attach to the features.
  optional BoxPredictor box_predictor = 7;

  // Anchor generator to compute anchors.
  optional AnchorGenerator anchor_generator = 8;

  // Post processing to apply on the predictions.
  optional PostProcessing post_processing = 9;

  // Whether to normalize the loss by number of groundtruth boxes that match to
  // the anchors.
  optional bool normalize_loss_by_num_matches = 10 [default = true];

  // Whether to normalize the localization loss by the code size of the box
  // encodings. This is applied along with other normalization factors.
  optional bool normalize_loc_loss_by_codesize = 14 [default = false];

  // Loss configuration for training.
  optional Loss loss = 11;

  // Whether to update batch norm parameters during training or not.
  // When training with a relative small batch size (e.g. 1), it is
  // desirable to disable batch norm update and use pretrained batch norm
  // params.
  //
  // Note: Some feature extractors are used with canned arg_scopes
  // (e.g resnet arg scopes).  In these cases training behavior of batch norm
  // variables may depend on both values of `batch_norm_trainable` and
  // `is_training`.
  //
  // When canned arg_scopes are used with feature extractors `conv_hyperparams`
  // will apply only to the additional layers that are added and are outside the
  // canned arg_scope.
  optional bool freeze_batchnorm = 16 [default = false];

  // Whether to update batch_norm inplace during training. This is required
  // for batch norm to work correctly on TPUs. When this is false, user must add
  // a control dependency on tf.GraphKeys.UPDATE_OPS for train/loss op in order
  // to update the batch norm moving average parameters.
  optional bool inplace_batchnorm_update = 15 [default = false];

  // Whether to add an implicit background class to one-hot encodings of
  // groundtruth labels. Set to false if training a single
  // class model or using an explicit background class.
  optional bool add_background_class = 21 [default = true];

  // Whether to use an explicit background class. Set to true if using
  // groundtruth labels with an explicit background class, as in multiclass
  // scores.
  optional bool explicit_background_class = 24 [default = false];

  // Whether to use groundtruth confidence scores as classification targets.
  // NOTE(review): semantics inferred from the field name — confirm against the
  // model builder before relying on this description.
  optional bool use_confidences_as_targets = 22 [default = false];

  // Weight applied to examples without explicit confidences; presumably used
  // together with `use_confidences_as_targets` — verify against the trainer.
  optional float implicit_example_weight = 23 [default = 1.0];

  // If true, raw (pre-postprocessing) detections are also returned from the
  // model's predict call.
  optional bool return_raw_detections_during_predict = 26 [default = false];

  // Configuration proto for MaskHead.
  // Next id: 11
  message MaskHead {
    // The height and the width of the predicted mask. Only used when
    // predict_instance_masks is true.
    optional int32 mask_height = 1 [default = 15];
    optional int32 mask_width = 2 [default = 15];

    // Whether to predict class agnostic masks. Only used when
    // predict_instance_masks is true.
    optional bool masks_are_class_agnostic = 3 [default = true];

    // The depth for the first conv2d_transpose op applied to the
    // image_features in the mask prediction branch. If set to 0, the value
    // will be set automatically based on the number of channels in the image
    // features and the number of classes.
    optional int32 mask_prediction_conv_depth = 4 [default = 256];

    // The number of convolutions applied to image_features in the mask
    // prediction branch.
    optional int32 mask_prediction_num_conv_layers = 5 [default = 2];

    // Whether to apply convolutions on mask features before upsampling using
    // nearest neighbor resizing.
    // By default, mask features are resized to [`mask_height`, `mask_width`]
    // before applying convolutions and predicting masks.
    optional bool convolve_then_upsample_masks = 6 [default = false];

    // Mask loss weight.
    optional float mask_loss_weight = 7 [default = 5.0];

    // Number of boxes to be generated at training time for computing mask loss.
    optional int32 mask_loss_sample_size = 8 [default = 16];

    // Hyperparameters for convolution ops used in the box predictor.
    optional Hyperparams conv_hyperparams = 9;

    // Output size (width and height are set to be the same) of the initial
    // bilinear interpolation based cropping during ROI pooling. Only used when
    // we have second stage prediction head enabled (e.g. mask head).
    optional int32 initial_crop_size = 10 [default = 15];
  }

  // Configs for mask head.
  optional MaskHead mask_head_config = 25;
}

// Next id: 20.
message SsdFeatureExtractor {
  // Field 6 was removed; keep its number reserved so it is never reused.
  reserved 6;

  // Type of ssd feature extractor.
  optional string type = 1;

  // The factor to alter the depth of the channels in the feature extractor.
  optional float depth_multiplier = 2 [default = 1.0];

  // Minimum number of the channels in the feature extractor.
  optional int32 min_depth = 3 [default = 16];

  // Hyperparameters that affect the layers of feature extractor added on top
  // of the base feature extractor.
  optional Hyperparams conv_hyperparams = 4;

  // Normally, SSD feature extractors are constructed by reusing an existing
  // base feature extractor (that has its own hyperparams) and adding new layers
  // on top of it. `conv_hyperparams` above normally applies only to the new
  // layers while base feature extractor uses its own default hyperparams. If
  // this value is set to true, the base feature extractor's hyperparams will be
  // overridden with the `conv_hyperparams`.
  optional bool override_base_feature_extractor_hyperparams = 9
      [default = false];

  // The nearest multiple to zero-pad the input height and width dimensions to.
  // For example, if pad_to_multiple = 2, input dimensions are zero-padded
  // until the resulting dimensions are even.
  optional int32 pad_to_multiple = 5 [default = 1];

  // Whether to use explicit padding when extracting SSD multiresolution
  // features. This will also apply to the base feature extractor if a MobileNet
  // architecture is used.
  optional bool use_explicit_padding = 7 [default = false];

  // Whether to use depthwise separable convolutions for to extract additional
  // feature maps added by SSD.
  optional bool use_depthwise = 8 [default = false];

  // At most one feature-pyramid variant may be configured.
  oneof feature_pyramid_oneof {
    // Feature Pyramid Networks config.
    FeaturePyramidNetworks fpn = 10;

    // Bidirectional Feature Pyramid Networks config.
    BidirectionalFeaturePyramidNetworks bifpn = 19;
  }

  // If true, replace preprocess function of feature extractor with a
  // placeholder. This should only be used if all the image preprocessing steps
  // happen outside the graph.
  optional bool replace_preprocessor_with_placeholder = 11 [default = false];

  // The number of SSD layers.
  optional int32 num_layers = 12 [default = 6];
}

// Configuration for Feature Pyramid Networks.
message FeaturePyramidNetworks {
  // We recommend to use multi_resolution_feature_map_generator with FPN, and
  // the levels there must match the levels defined below for better
  // performance.
  // Correspondence from FPN levels to Resnet/Mobilenet V1 feature maps:
  // FPN Level        Resnet Feature Map      Mobilenet-V1 Feature Map
  //     2               Block 1                Conv2d_3_pointwise
  //     3               Block 2                Conv2d_5_pointwise
  //     4               Block 3                Conv2d_11_pointwise
  //     5               Block 4                Conv2d_13_pointwise
  //     6               Bottomup_5             bottom_up_Conv2d_14
  //     7               Bottomup_6             bottom_up_Conv2d_15
  //     8               Bottomup_7             bottom_up_Conv2d_16
  //     9               Bottomup_8             bottom_up_Conv2d_17

  // minimum level in feature pyramid
  optional int32 min_level = 1 [default = 3];

  // maximum level in feature pyramid
  optional int32 max_level = 2 [default = 7];

  // channel depth for additional coarse feature layers.
  optional int32 additional_layer_depth = 3 [default = 256];
}

// Configuration for Bidirectional Feature Pyramid Networks.
message BidirectionalFeaturePyramidNetworks {
  // minimum level in the feature pyramid.
  optional int32 min_level = 1 [default = 3];

  // maximum level in the feature pyramid.
  optional int32 max_level = 2 [default = 7];

  // The number of repeated top-down bottom-up iterations for BiFPN-based
  // feature extractors (bidirectional feature pyramid networks).
  optional int32 num_iterations = 3;

  // The number of filters (channels) to use in feature pyramid layers for
  // BiFPN-based feature extractors (bidirectional feature pyramid networks).
  optional int32 num_filters = 4;

  // Method used to combine inputs to BiFPN nodes.
  // Note: double-quoted per protobuf style; wire/semantics unchanged.
  optional string combine_method = 5 [default = "fast_attention"];
}