syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/post_processing.proto";
import "object_detection/protos/preprocessor.proto";

// Configuration for the CenterNet meta architecture from the "Objects as
// Points" paper [1]
// [1]: https://arxiv.org/abs/1904.07850

// Next Id = 26
message CenterNet {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Feature extractor config.
  optional CenterNetFeatureExtractor feature_extractor = 2;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 3;

  // If set, all task heads will be constructed with separable convolutions.
  optional bool use_depthwise = 13 [default = false];

  // Indicates whether or not to use the sparse version of the Op that computes
  // the center heatmaps. The sparse version scales better with number of
  // channels in the heatmap, but in some cases is known to cause an OOM error.
  // TODO(b/170989061) When bug is fixed, make this the default behavior.
  optional bool compute_heatmap_sparse = 15 [default = false];

  // Parameters to determine the model architecture/layers of the prediction
  // heads.
  message PredictionHeadParams {
    // The two fields num_filters and kernel_sizes correspond to the parameters
    // of the convolutional layers used by the prediction head. If provided,
    // the lengths of the two repeated fields need to be the same and represent
    // the number of convolutional layers.

    // Corresponds to the "filters" argument in tf.keras.layers.Conv2D. If not
    // provided, the default value [256] will be used.
    repeated int32 num_filters = 1;

    // Corresponds to the "kernel_size" argument in tf.keras.layers.Conv2D. If
    // not provided, the default value [3] will be used.
    repeated int32 kernel_sizes = 2;
  }
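
  // For illustration, a hypothetical head with two 3x3 convolutional layers of
  // 128 filters each could be configured as (values are examples only):
  //   scale_head_params {
  //     num_filters: 128
  //     num_filters: 128
  //     kernel_sizes: 3
  //     kernel_sizes: 3
  //   }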

  // Parameters which are related to object detection task.
  message ObjectDetection {
    // The original fields are moved to ObjectCenterParams or deleted.
    reserved 2, 5, 6, 7;

    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Weight for the offset localization loss.
    optional float offset_loss_weight = 3 [default = 1.0];

    // Weight for the height/width localization loss.
    optional float scale_loss_weight = 4 [default = 0.1];

    // Localization loss configuration for object scale and offset losses.
    optional LocalizationLoss localization_loss = 8;

    // Parameters to determine the architecture of the object scale prediction
    // head.
    optional PredictionHeadParams scale_head_params = 9;

    // Parameters to determine the architecture of the object offset prediction
    // head.
    optional PredictionHeadParams offset_head_params = 10;
  }
  optional ObjectDetection object_detection_task = 4;
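
  // Example (illustrative values only) of a detection task block, assuming an
  // L1 localization loss (`l1_localization_loss`) is available in
  // losses.proto:
  //   object_detection_task {
  //     task_loss_weight: 1.0
  //     offset_loss_weight: 1.0
  //     scale_loss_weight: 0.1
  //     localization_loss {
  //       l1_localization_loss {}
  //     }
  //   }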

  // Parameters related to object center prediction. This is required for both
  // object detection and keypoint estimation tasks.
  message ObjectCenterParams {
    // Weight for the object center loss.
    optional float object_center_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for object center loss.
    optional ClassificationLoss classification_loss = 2;

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 3 [default = -2.19];

    // The minimum IoU overlap boxes need to have in order not to be penalized.
    optional float min_box_overlap_iou = 4 [default = 0.7];

    // Maximum number of boxes to predict.
    optional int32 max_box_predictions = 5 [default = 100];

    // If set, loss is only computed for the labeled classes.
    optional bool use_labeled_classes = 6 [default = false];

    // The keypoint weights used for calculating the location of object center.
    // When the field is provided, the number of weights needs to be the same as
    // the number of keypoints. The object center is calculated by the weighted
    // mean of the keypoint locations. When the field is not provided, the
    // object center is determined by the bounding box groundtruth annotations
    // (default behavior).
    repeated float keypoint_weights_for_center = 7;

    // Parameters to determine the architecture of the object center prediction
    // head.
    optional PredictionHeadParams center_head_params = 8;

    // Max pool kernel size to use to pull off peak score locations in a
    // neighborhood for the object detection heatmap.
    optional int32 peak_max_pool_kernel_size = 9 [default = 3];
  }
  optional ObjectCenterParams object_center_params = 5;
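
  // Example (illustrative values only) of an object center block, assuming the
  // penalty-reduced focal loss (`penalty_reduced_logistic_focal_loss`) is
  // available in losses.proto:
  //   object_center_params {
  //     object_center_loss_weight: 1.0
  //     min_box_overlap_iou: 0.7
  //     max_box_predictions: 100
  //     classification_loss {
  //       penalty_reduced_logistic_focal_loss {
  //         alpha: 2.0
  //         beta: 4.0
  //       }
  //     }
  //   }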

  // Path of the file that contains the label map along with the keypoint
  // information, including the keypoint indices, corresponding labels, and the
  // corresponding class. The file should be the same one as used in the input
  // pipeline. Note that a plain-text StringIntLabelMap proto is expected in
  // this file.
  // It is required only if the keypoint estimation task is specified.
  optional string keypoint_label_map_path = 6;

  // Parameters which are related to keypoint estimation task.
  message KeypointEstimation {
    // Name of the task, e.g. "human pose". Note that the task name should be
    // unique to each keypoint task.
    optional string task_name = 1;

    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 2 [default = 1.0];

    // Loss configuration for keypoint heatmap, offset, regression losses. Note
    // that the localization loss is used for offset/regression losses and
    // classification loss is used for heatmap loss.
    optional Loss loss = 3;

    // The name of the class that contains the keypoints for this task. This is
    // used to retrieve the corresponding keypoint indices from the label map.
    // Note that this corresponds to the "name" field, not "display_name".
    optional string keypoint_class_name = 4;

    // The standard deviation of the Gaussian kernel used to generate the
    // keypoint heatmap, in units of output-image pixels. This map provides the
    // flexibility of using a different Gaussian kernel size for each keypoint
    // class. If provided, the values specified here override the keypoint
    // standard deviations; otherwise, the default value 5.0 is used.
    // TODO(yuhuic): Update the default value once we find the best value.
    map<string, float> keypoint_label_to_std = 5;

    // Loss weights corresponding to different heads.
    optional float keypoint_regression_loss_weight = 6 [default = 1.0];
    optional float keypoint_heatmap_loss_weight = 7 [default = 1.0];
    optional float keypoint_offset_loss_weight = 8 [default = 1.0];

    // The initial bias value of the convolution kernel of the keypoint heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 9 [default = -2.19];

    // The heatmap score threshold for a keypoint to become a valid candidate.
    optional float keypoint_candidate_score_threshold = 10 [default = 0.1];

    // The maximum number of candidates to retrieve for each keypoint.
    optional int32 num_candidates_per_keypoint = 11 [default = 100];

    // Max pool kernel size to use to pull off peak score locations in a
    // neighborhood (independently for each keypoint type).
    optional int32 peak_max_pool_kernel_size = 12 [default = 3];

    // The default score to use for regressed keypoints that are not
    // successfully snapped to a nearby candidate.
    optional float unmatched_keypoint_score = 13 [default = 0.1];

    // The multiplier to expand the bounding boxes (either the provided boxes or
    // those which tightly cover the regressed keypoints). Note that new
    // expanded box for an instance becomes the feasible search window for all
    // associated keypoints.
    optional float box_scale = 14 [default = 1.2];

    // The scale parameter that multiplies the largest dimension of a bounding
    // box. The resulting distance becomes a search radius for candidates in the
    // vicinity of each regressed keypoint.
    optional float candidate_search_scale = 15 [default = 0.3];

    // One of ['min_distance', 'score_distance_ratio',
    // 'score_scaled_distance_ratio', 'gaussian_weighted'] indicating how to
    // select the keypoint candidate.
    optional string candidate_ranking_mode = 16 [default = "min_distance"];

    // The score distance ratio offset, only used if candidate_ranking_mode is
    // 'score_distance_ratio'. The offset is used in the maximization of score
    // distance ratio, defined as:
    // keypoint_score / (distance + score_distance_offset)
    optional float score_distance_offset = 22 [default = 1.0];
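
    // For example, with the default offset of 1.0, a candidate with heatmap
    // score 0.8 at distance 3 ranks as 0.8 / (3 + 1) = 0.2, whereas a
    // candidate with score 0.5 at distance 1 ranks higher at
    // 0.5 / (1 + 1) = 0.25.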

    // A scalar used to multiply the bounding box size to be used as the offset
    // in the score-to-distance-ratio formula. Only applicable when the
    // candidate_ranking_mode is score_scaled_distance_ratio.
    // The keypoint candidates are ranked using the formula:
    //   ranking_score = score / (distance + offset)
    // where 'score' is the keypoint heatmap score, 'distance' is the distance
    // between the heatmap peak location and the regressed joint location,
    // 'offset' is a function of the predicted bounding box:
    //   offset = max(bbox height, bbox width) * score_distance_multiplier
    optional float score_distance_multiplier = 28 [default = 0.1];
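
    // For example, with the default multiplier of 0.1 and a 100x60 box, the
    // offset is max(100, 60) * 0.1 = 10, so a candidate with score 0.9 at
    // distance 5 ranks as 0.9 / (5 + 10) = 0.06.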

    // A scalar used to multiply the Gaussian standard deviation to control the
    // Gaussian kernel which is used to weight the candidates. Only applicable
    // when the candidate_ranking_mode is gaussian_weighted.
    // The keypoint candidates are ranked using the formula:
    //   scores * exp((-distances^2) / (2 * sigma^2))
    // where 'distances' is the distance between the heatmap peak location and
    // the regressed joint location and 'sigma' is the Gaussian standard
    // deviation used in generating the Gaussian heatmap target, multiplied by
    // the 'std_dev_multiplier'.
    optional float std_dev_multiplier = 29 [default = 1.0];

    // The radius (in units of output pixels) around the heatmap peak within
    // which offset targets are assigned. If set to 0, the offset target is
    // only assigned to the heatmap peak (same behavior as the original paper).
    optional int32 offset_peak_radius = 17 [default = 0];

    // Indicates whether to assign offsets for each keypoint channel
    // separately. If set False, the output offset target has the shape
    // [batch_size, out_height, out_width, 2] (same behavior as the original
    // paper). If set True, the output offset target has the shape [batch_size,
    // out_height, out_width, 2 * num_keypoints] (recommended when the
    // offset_peak_radius is not zero).
    optional bool per_keypoint_offset = 18 [default = false];

    // Indicates whether to predict the depth of each keypoint. Note that this
    // is only supported in the single-class keypoint task.
    optional bool predict_depth = 19 [default = false];

    // Indicates whether to predict depths for each keypoint channel
    // separately. If set False, the output depth target has the shape
    // [batch_size, out_height, out_width, 1]. If set True, the output depth
    // target has the shape [batch_size, out_height, out_width,
    // num_keypoints]. It is recommended to set this field and
    // "per_keypoint_offset" to True at the same time.
    optional bool per_keypoint_depth = 20 [default = false];

    // The weight of the keypoint depth loss.
    optional float keypoint_depth_loss_weight = 21 [default = 1.0];

    // Whether keypoints outside the image frame should be clipped back to the
    // image boundary. If true, the keypoints that are clipped have scores set
    // to 0.0.
    optional bool clip_out_of_frame_keypoints = 23 [default = false];

    // Whether instances should be rescored based on keypoint confidences. If
    // False, will use the detection score (from the object center heatmap). If
    // True, will compute new scores with:
    // new_score = o * (1/k) sum {s_i}
    // where o is the object score, s_i is the score for keypoint i, and k is
    // the number of keypoints for that class.
    optional bool rescore_instances = 24 [default = false];
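
    // For example, an instance with object score o = 0.8 whose three keypoints
    // score 0.9, 0.6, and 0.3 is rescored to
    // 0.8 * (0.9 + 0.6 + 0.3) / 3 = 0.48.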

    // A scalar used when "rescore_instances" is set to True. The detection
    // score of an instance is set to be the average score among those keypoints
    // with scores higher than the threshold.
    optional float rescoring_threshold = 30 [default = 0.0];

    // The ratio used to multiply the output feature map size to determine the
    // denominator in the Gaussian formula. Only applicable when the
    // candidate_ranking_mode is set to be 'gaussian_weighted_const'.
    optional float gaussian_denom_ratio = 31 [default = 0.1];

    // Whether to use the keypoint postprocessing logic that replaces topk op
    // with argmax. Usually used when exporting the model for predicting
    // keypoints of multiple instances in the browser.
    optional bool argmax_postprocessing = 32 [default = false];

    // Parameters to determine the architecture of the keypoint heatmap
    // prediction head.
    optional PredictionHeadParams heatmap_head_params = 25;

    // Parameters to determine the architecture of the keypoint offset
    // prediction head.
    optional PredictionHeadParams offset_head_params = 26;

    // Parameters to determine the architecture of the keypoint regression
    // prediction head.
    optional PredictionHeadParams regress_head_params = 27;
  }
  repeated KeypointEstimation keypoint_estimation_task = 7;
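
  // Example (illustrative values only) of a human-pose task; the task and
  // class names are placeholders and must match the keypoint label map:
  //   keypoint_estimation_task {
  //     task_name: "human_pose"
  //     task_loss_weight: 1.0
  //     keypoint_class_name: "person"
  //     keypoint_regression_loss_weight: 0.1
  //     keypoint_heatmap_loss_weight: 1.0
  //     keypoint_offset_loss_weight: 1.0
  //     candidate_ranking_mode: "min_distance"
  //     loss {
  //       classification_loss {
  //         penalty_reduced_logistic_focal_loss { alpha: 2.0 beta: 4.0 }
  //       }
  //       localization_loss {
  //         l1_localization_loss {}
  //       }
  //     }
  //   }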

  // Parameters which are related to mask estimation task.
  // Note: Currently, CenterNet supports a weak instance segmentation, where
  // semantic segmentation masks are estimated, and then cropped based on
  // bounding box detections. Therefore, it is possible for the same image
  // pixel to be assigned to multiple instances.
  message MaskEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for segmentation loss.
    optional ClassificationLoss classification_loss = 2;

    // Each instance mask (one per detection) is cropped and resized (bilinear
    // resampling) from the predicted segmentation feature map. After
    // resampling, the masks are binarized with the provided score threshold.
    optional int32 mask_height = 4 [default = 256];
    optional int32 mask_width = 5 [default = 256];
    optional float score_threshold = 6 [default = 0.5];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 3 [default = -2.19];

    // Parameters to determine the architecture of the segmentation mask
    // prediction head.
    optional PredictionHeadParams mask_head_params = 7;
  }
  optional MaskEstimation mask_estimation_task = 8;
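
  // Example (illustrative values only), assuming a weighted softmax
  // classification loss (`weighted_softmax`) is available in losses.proto:
  //   mask_estimation_task {
  //     task_loss_weight: 1.0
  //     mask_height: 256
  //     mask_width: 256
  //     score_threshold: 0.5
  //     classification_loss {
  //       weighted_softmax {}
  //     }
  //   }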

  // Parameters which are related to DensePose estimation task.
  // http://densepose.org/
  message DensePoseEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Class ID (0-indexed) that corresponds to the object in the label map that
    // contains DensePose data.
    optional int32 class_id = 2;

    // Loss configuration for DensePose heatmap and regression losses. Note
    // that the localization loss is used for surface coordinate losses and
    // classification loss is used for part classification losses.
    optional Loss loss = 3;

    // The number of body parts.
    optional int32 num_parts = 4 [default = 24];

    // Loss weights for the two DensePose heads.
    optional float part_loss_weight = 5 [default = 1.0];
    optional float coordinate_loss_weight = 6 [default = 1.0];

    // Whether to upsample the prediction feature maps back to the original
    // input dimension prior to applying loss. This has the benefit of
    // maintaining finer groundtruth location information.
    optional bool upsample_to_input_res = 7 [default = true];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 8 [default = -2.19];
  }
  optional DensePoseEstimation densepose_estimation_task = 9;
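
  // Example (illustrative values only) for a label map whose class 0 carries
  // DensePose annotations:
  //   densepose_estimation_task {
  //     task_loss_weight: 0.5
  //     class_id: 0
  //     num_parts: 24
  //     part_loss_weight: 1.0
  //     coordinate_loss_weight: 1.0
  //     upsample_to_input_res: true
  //     loss {
  //       classification_loss { weighted_softmax {} }
  //       localization_loss { l1_localization_loss {} }
  //     }
  //   }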

  // Parameters which are related to tracking embedding estimation task.
  // A Simple Baseline for Multi-Object Tracking [2]
  // [2]: https://arxiv.org/abs/2004.01888
  message TrackEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // The maximum track ID of the dataset.
    optional int32 num_track_ids = 2;

    // The embedding size for re-identification (ReID) task in tracking.
    optional int32 reid_embed_size = 3 [default = 128];

    // The number of (fully-connected, batch-norm, relu) layers for track ID
    // classification head. The output dimension of each intermediate FC layer
    // will all be 'reid_embed_size'. The last FC layer will directly project to
    // the track ID classification space of size 'num_track_ids' without
    // batch-norm and relu layers.
    optional int32 num_fc_layers = 4 [default = 1];

    // Classification loss configuration for ReID loss.
    optional ClassificationLoss classification_loss = 5;
  }
  optional TrackEstimation track_estimation_task = 10;
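
  // Example (illustrative values only) for a dataset with, hypothetically,
  // 1000 distinct track IDs:
  //   track_estimation_task {
  //     task_loss_weight: 1.0
  //     num_track_ids: 1000
  //     reid_embed_size: 128
  //     num_fc_layers: 1
  //     classification_loss {
  //       weighted_softmax {}
  //     }
  //   }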

  // Temporal offset prediction head similar to CenterTrack.
  // Currently our implementation adopts LSTM, different from original paper.
  // See go/lstd-centernet for more details.
  // Tracking Objects as Points [3]
  // [3]: https://arxiv.org/abs/2004.01177
  message TemporalOffsetEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Localization loss configuration for offset loss.
    optional LocalizationLoss localization_loss = 2;
  }
  optional TemporalOffsetEstimation temporal_offset_task = 12;
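
  // Example (illustrative values only), again assuming `l1_localization_loss`
  // from losses.proto:
  //   temporal_offset_task {
  //     task_loss_weight: 1.0
  //     localization_loss {
  //       l1_localization_loss {}
  //     }
  //   }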

  // Mask prediction support using DeepMAC. See https://arxiv.org/abs/2104.00613
  // Next ID 34
  message DeepMACMaskEstimation {
    // The loss used for penalizing mask predictions.
    optional ClassificationLoss classification_loss = 1;

    // Weight of the mask prediction loss.
    optional float task_loss_weight = 2 [default = 1.0];

    // The dimension of the per-instance embedding.
    optional int32 dim = 3 [default = 256];

    // The dimension of the per-pixel embedding.
    optional int32 pixel_embedding_dim = 4 [default = 16];

    // If set, masks are only kept for classes listed here. Masks are deleted
    // for all other classes. Note that this is only done at training time, eval
    // behavior is unchanged.
    repeated int32 allowed_masked_classes_ids = 5;

    // The size of cropped pixel embedding that goes into the 2D mask prediction
    // network (RoI align).
    optional int32 mask_size = 6 [default = 32];

    // If set to a positive value, we subsample instances by this amount to
    // save memory during training.
    optional int32 mask_num_subsamples = 7 [default = -1];

    // Whether or not to use (x, y) coordinates as input to mask net.
    optional bool use_xy = 8 [default = true];

    // Defines the kind of architecture we want to use for mask network.
    optional string network_type = 9 [default = "hourglass52"];

    // Whether or not we want to use instance embedding in mask network.
    optional bool use_instance_embedding = 10 [default = true];

    // Number of channels in the initial block of the mask prediction network.
    optional int32 num_init_channels = 11 [default = 64];

    // Whether or not to predict masks at full resolution. If true, we predict
    // masks at the resolution of the output stride. Otherwise, masks are
    // predicted at the resolution defined by mask_size.
    optional bool predict_full_resolution_masks = 12 [default = false];

    // If predict_full_resolution_masks is set, this parameter controls the size
    // of cropped masks returned by post-process. To be compatible with the rest
    // of the API, masks are always cropped and resized according to detected
    // boxes in postprocess.
    optional int32 postprocess_crop_size = 13 [default = 256];

    // The maximum relative amount by which boxes will be jittered before
    // RoI crop happens. The x and y coordinates of the box are jittered
    // relative to width and height respectively.
    optional float max_roi_jitter_ratio = 14 [default = 0.0];

    // The mode for jittering box ROIs. See RandomJitterBoxes in
    // preprocessor.proto for more details.
    optional RandomJitterBoxes.JitterMode jitter_mode = 15 [default = DEFAULT];

    // Weight for the box consistency loss as described in the BoxInst paper
    // https://arxiv.org/abs/2012.02310
    optional float box_consistency_loss_weight = 16 [default = 0.0];

    // Threshold on the color similarity between neighboring pixels used by
    // the color consistency (pairwise) loss described in the BoxInst paper.
    optional float color_consistency_threshold = 17 [default = 0.4];

    // Dilation (spacing, in pixels) of the pixel neighborhood used when
    // comparing colors for the color consistency loss.
    optional int32 color_consistency_dilation = 18 [default = 2];

    // Weight for the color consistency loss.
    optional float color_consistency_loss_weight = 19 [default = 0.0];

    // How to normalize the box consistency loss. See the LossNormalize enum
    // below for the available options.
    optional LossNormalize box_consistency_loss_normalize = 20
        [default = NORMALIZE_AUTO];

    // If set, will use the bounding box tightness prior approach. This means
    // that the max will be restricted to only be inside the box for both
    // dimensions. See details here:
    // https://papers.nips.cc/paper/2019/hash/e6e713296627dff6475085cc6a224464-Abstract.html
    optional bool box_consistency_tightness = 21 [default = false];

    // Number of steps over which the weight of the color consistency loss is
    // ramped up, beginning at 'color_consistency_warmup_start'.
    optional int32 color_consistency_warmup_steps = 22 [default = 0];

    // Training step at which the color consistency loss warmup begins.
    optional int32 color_consistency_warmup_start = 23 [default = 0];

    // This flag controls whether or not we use the outputs from only the
    // last stage of the hourglass for training the mask-heads.
    //
    // DeepMAC has been refactored to process the entire batch at once,
    // instead of the previous (simple) approach of processing one sample at
    // a time. Because of this, we need to set this flag to continue using
    // the old models with the same training hardware.
    //
    // This flag is not needed for 1024x1024 models. The performance and
    // memory usage are the same as before.
    //
    // For 512x512 models:
    // - Setting this flag to true will let the model train on TPU-v3 32
    //   chips. We observed a small (0.26 mAP) performance drop when doing so.
    // - Setting this flag to false (default) increases the TPU requirement
    //   to TPU-v3 128 and reproduces previously demonstrated performance
    //   within error bars.
    optional bool use_only_last_stage = 24 [default = false];

    // Parameters controlling the augmented self-supervision loss, which
    // penalizes disagreement between mask predictions on an image and on an
    // augmented (translated/flipped/rescaled) copy of it.

    // Maximum translation (as a fraction of the image size) applied when
    // generating the augmented view.
    optional float augmented_self_supervision_max_translation = 25
        [default = 0.0];

    // Probability of horizontally flipping the augmented view.
    optional float augmented_self_supervision_flip_probability = 26
        [default = 0.0];

    // Weight of the augmented self-supervision loss.
    optional float augmented_self_supervision_loss_weight = 27 [default = 0.0];

    // Training step at which the self-supervision loss warmup begins, and the
    // number of steps over which its weight is ramped up.
    optional int32 augmented_self_supervision_warmup_start = 28 [default = 0];
    optional int32 augmented_self_supervision_warmup_steps = 29 [default = 0];

    // The loss used to compare predictions between the original and augmented
    // views. See the AugmentedSelfSupervisionLoss enum below.
    optional AugmentedSelfSupervisionLoss augmented_self_supervision_loss = 30
        [default = LOSS_DICE];

    // Minimum and maximum scale factors for rescaling the augmented view.
    optional float augmented_self_supervision_scale_min = 31 [default = 1.0];
    optional float augmented_self_supervision_scale_max = 32 [default = 1.0];

    // The loss weight for the pointly supervised loss as defined in the paper
    // https://arxiv.org/abs/2104.06404
    //
    // We assume that point supervision is given through a keypoint dataset,
    // where each keypoint represents a sampled point, and its depth indicates
    // whether it is a foreground or background point.
    // Depth = +1 is assumed to be foreground and
    // Depth = -1 is assumed to be background.
    optional float pointly_supervised_keypoint_loss_weight = 33 [default = 0.0];

  }

  optional DeepMACMaskEstimation deepmac_mask_estimation = 14;
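
  // Example (illustrative values only) of a DeepMAC block predicting 32x32
  // masks, assuming a weighted sigmoid loss (`weighted_sigmoid`) is available
  // in losses.proto:
  //   deepmac_mask_estimation {
  //     task_loss_weight: 5.0
  //     dim: 256
  //     pixel_embedding_dim: 16
  //     mask_size: 32
  //     network_type: "hourglass52"
  //     classification_loss {
  //       weighted_sigmoid {}
  //     }
  //   }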

  // CenterNet does not apply conventional post processing operations such as
  // non max suppression as it applies a max-pool operator on box centers.
  // However, in some cases we observe the need to remove duplicate predictions
  // from CenterNet. Use this optional parameter to apply traditional non max
  // suppression and score thresholding.
  optional PostProcessing post_processing = 24;
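
  // Example (illustrative values only), assuming the batch_non_max_suppression
  // message defined in post_processing.proto:
  //   post_processing {
  //     batch_non_max_suppression {
  //       score_threshold: 0.1
  //       iou_threshold: 0.5
  //       max_detections_per_class: 100
  //       max_total_detections: 100
  //     }
  //   }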

  // If set, dictionary items returned by the predict() function
  // are appended to the output of postprocess().
  optional bool output_prediction_dict = 25 [default = false];
}

enum LossNormalize {
  NORMALIZE_AUTO = 0;  // SUM for 2D inputs (dice loss) and MEAN for others.
  NORMALIZE_GROUNDTRUTH_COUNT = 1;
  NORMALIZE_BALANCED = 3;
}

enum AugmentedSelfSupervisionLoss {
  LOSS_UNSET = 0;
  LOSS_DICE = 1;
  LOSS_MSE = 2;
  LOSS_KL_DIV = 3;
}

message CenterNetFeatureExtractor {
  optional string type = 1;

  // Channel means to be subtracted from each image channel. If not specified,
  // we use a default value of 0.
  repeated float channel_means = 2;

  // Channel standard deviations. Each channel will be normalized by dividing
  // it by its standard deviation. If not specified, we use a default value
  // of 1.
  repeated float channel_stds = 3;

  // If set, will change channel order to be [blue, green, red]. This can be
  // useful to be compatible with some pre-trained feature extractors.
  optional bool bgr_ordering = 4 [default = false];

  // If set, the feature upsampling layers will be constructed with
  // separable convolutions. This is typically applied to feature pyramid
  // network if any.
  optional bool use_depthwise = 5 [default = false];

  // Depth multiplier. Only valid for specific models (e.g. MobileNet). See
  // subclasses of `CenterNetFeatureExtractor`.
  optional float depth_multiplier = 9 [default = 1.0];

  // Whether to use separable convolutions. Only valid for specific
  // models. See subclasses of `CenterNetFeatureExtractor`.
  optional bool use_separable_conv = 10 [default = false];

  // Which interpolation method to use for the upsampling ops in the FPN.
  // Currently only valid for CenterNetMobileNetV2FPNFeatureExtractor. The
  // value can be one of 'nearest' or 'bilinear'.
  optional string upsampling_interpolation = 11 [default = 'nearest'];
}
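
// Example (illustrative values only) of a feature extractor block; the type
// string must match a registered extractor, e.g. "hourglass_104", and the
// channel statistics shown here are placeholders, not the exact values used
// by any released checkpoint:
//   feature_extractor {
//     type: "hourglass_104"
//     channel_means: [104.0, 114.0, 120.0]
//     channel_stds: [74.0, 70.0, 71.0]
//     bgr_ordering: true
//   }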