syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/post_processing.proto";
import "object_detection/protos/preprocessor.proto";

// Configuration for the CenterNet meta architecture from the "Objects as
// Points" paper [1]
// [1]: https://arxiv.org/abs/1904.07850

// Next Id = 26
message CenterNet {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Feature extractor config.
  optional CenterNetFeatureExtractor feature_extractor = 2;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 3;

  // If set, all task heads will be constructed with separable convolutions.
  optional bool use_depthwise = 13 [default = false];

  // Indicates whether or not to use the sparse version of the Op that computes
  // the center heatmaps. The sparse version scales better with number of
  // channels in the heatmap, but in some cases is known to cause an OOM error.
  // TODO(b/170989061) When bug is fixed, make this the default behavior.
  optional bool compute_heatmap_sparse = 15 [default = false];

  // Parameters to determine the model architecture/layers of the prediction
  // heads.
  message PredictionHeadParams {
    // The two fields num_filters and kernel_sizes correspond to the parameters
    // of the convolutional layers used by the prediction head. If provided,
    // the lengths of the two repeated fields need to be the same and represent
    // the number of convolutional layers.

    // Corresponds to the "filters" argument in tf.keras.layers.Conv2D. If not
    // provided, the default value [256] will be used.
    repeated int32 num_filters = 1;

    // Corresponds to the "kernel_size" argument in tf.keras.layers.Conv2D. If
    // not provided, the default value [3] will be used.
    repeated int32 kernel_sizes = 2;
  }
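
  // For illustration, a hypothetical head with two 3x3 convolutional layers of
  // 128 filters each could be configured as (values are examples only):
  //   scale_head_params {
  //     num_filters: 128
  //     num_filters: 128
  //     kernel_sizes: 3
  //     kernel_sizes: 3
  //   }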

  // Parameters which are related to object detection task.
  message ObjectDetection {
    // The original fields are moved to ObjectCenterParams or deleted.
    reserved 2, 5, 6, 7;

    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Weight for the offset localization loss.
    optional float offset_loss_weight = 3 [default = 1.0];

    // Weight for the height/width localization loss.
    optional float scale_loss_weight = 4 [default = 0.1];

    // Localization loss configuration for object scale and offset losses.
    optional LocalizationLoss localization_loss = 8;

    // Parameters to determine the architecture of the object scale prediction
    // head.
    optional PredictionHeadParams scale_head_params = 9;

    // Parameters to determine the architecture of the object offset prediction
    // head.
    optional PredictionHeadParams offset_head_params = 10;
  }
  optional ObjectDetection object_detection_task = 4;
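
  // Example (illustrative values only) of a detection task block, assuming an
  // L1 localization loss (`l1_localization_loss`) is available in
  // losses.proto:
  //   object_detection_task {
  //     task_loss_weight: 1.0
  //     offset_loss_weight: 1.0
  //     scale_loss_weight: 0.1
  //     localization_loss {
  //       l1_localization_loss {}
  //     }
  //   }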

  // Parameters related to object center prediction. This is required for both
  // object detection and keypoint estimation tasks.
  message ObjectCenterParams {
    // Weight for the object center loss.
    optional float object_center_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for object center loss.
    optional ClassificationLoss classification_loss = 2;

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 3 [default = -2.19];

    // The minimum IoU overlap boxes need to have in order not to be penalized.
    optional float min_box_overlap_iou = 4 [default = 0.7];

    // Maximum number of boxes to predict.
    optional int32 max_box_predictions = 5 [default = 100];

    // If set, loss is only computed for the labeled classes.
    optional bool use_labeled_classes = 6 [default = false];

    // The keypoint weights used for calculating the location of object center.
    // When the field is provided, the number of weights needs to be the same as
    // the number of keypoints. The object center is calculated by the weighted
    // mean of the keypoint locations. When the field is not provided, the
    // object center is determined by the bounding box groundtruth annotations
    // (default behavior).
    repeated float keypoint_weights_for_center = 7;

    // Parameters to determine the architecture of the object center prediction
    // head.
    optional PredictionHeadParams center_head_params = 8;

    // Max pool kernel size to use to pull off peak score locations in a
    // neighborhood for the object detection heatmap.
    optional int32 peak_max_pool_kernel_size = 9 [default = 3];
  }
  optional ObjectCenterParams object_center_params = 5;
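
  // Example (illustrative values only) of an object center block, assuming the
  // penalty-reduced focal loss (`penalty_reduced_logistic_focal_loss`) is
  // available in losses.proto:
  //   object_center_params {
  //     object_center_loss_weight: 1.0
  //     min_box_overlap_iou: 0.7
  //     max_box_predictions: 100
  //     classification_loss {
  //       penalty_reduced_logistic_focal_loss {
  //         alpha: 2.0
  //         beta: 4.0
  //       }
  //     }
  //   }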

  // Path of the file that contains the label map along with the keypoint
  // information, including the keypoint indices, corresponding labels, and the
  // corresponding class. The file should be the same one as used in the input
  // pipeline. Note that a plain-text StringIntLabelMap proto is expected in
  // this file.
  // It is required only if the keypoint estimation task is specified.
  optional string keypoint_label_map_path = 6;

  // Parameters which are related to keypoint estimation task.
  message KeypointEstimation {
    // Name of the task, e.g. "human pose". Note that the task name should be
    // unique to each keypoint task.
    optional string task_name = 1;

    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 2 [default = 1.0];

    // Loss configuration for keypoint heatmap, offset, regression losses. Note
    // that the localization loss is used for offset/regression losses and
    // classification loss is used for heatmap loss.
    optional Loss loss = 3;

    // The name of the class that contains the keypoints for this task. This is
    // used to retrieve the corresponding keypoint indices from the label map.
    // Note that this corresponds to the "name" field, not "display_name".
    optional string keypoint_class_name = 4;

    // The standard deviation of the Gaussian kernel used to generate the
    // keypoint heatmap, in units of output-image pixels. This map provides the
    // flexibility of using a different Gaussian kernel size for each keypoint
    // class. If provided, the values specified here override the keypoint
    // standard deviations; otherwise, the default value 5.0 is used.
    // TODO(yuhuic): Update the default value once we find the best value.
    map<string, float> keypoint_label_to_std = 5;

    // Loss weights corresponding to different heads.
    optional float keypoint_regression_loss_weight = 6 [default = 1.0];
    optional float keypoint_heatmap_loss_weight = 7 [default = 1.0];
    optional float keypoint_offset_loss_weight = 8 [default = 1.0];

    // The initial bias value of the convolution kernel of the keypoint heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 9 [default = -2.19];

    // The heatmap score threshold for a keypoint to become a valid candidate.
    optional float keypoint_candidate_score_threshold = 10 [default = 0.1];

    // The maximum number of candidates to retrieve for each keypoint.
    optional int32 num_candidates_per_keypoint = 11 [default = 100];

    // Max pool kernel size to use to pull off peak score locations in a
    // neighborhood (independently for each keypoint type).
    optional int32 peak_max_pool_kernel_size = 12 [default = 3];

    // The default score to use for regressed keypoints that are not
    // successfully snapped to a nearby candidate.
    optional float unmatched_keypoint_score = 13 [default = 0.1];

    // The multiplier to expand the bounding boxes (either the provided boxes or
    // those which tightly cover the regressed keypoints). Note that new
    // expanded box for an instance becomes the feasible search window for all
    // associated keypoints.
    optional float box_scale = 14 [default = 1.2];

    // The scale parameter that multiplies the largest dimension of a bounding
    // box. The resulting distance becomes a search radius for candidates in the
    // vicinity of each regressed keypoint.
    optional float candidate_search_scale = 15 [default = 0.3];

    // One of ['min_distance', 'score_distance_ratio',
    // 'score_scaled_distance_ratio', 'gaussian_weighted'] indicating how to
    // select the keypoint candidate.
    optional string candidate_ranking_mode = 16 [default = "min_distance"];

    // The score distance ratio offset, only used if candidate_ranking_mode is
    // 'score_distance_ratio'. The offset is used in the maximization of score
    // distance ratio, defined as:
    // keypoint_score / (distance + score_distance_offset)
    optional float score_distance_offset = 22 [default = 1.0];
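
    // For example, with the default offset of 1.0, a candidate with heatmap
    // score 0.8 at distance 3 ranks as 0.8 / (3 + 1) = 0.2, whereas a
    // candidate with score 0.5 at distance 1 ranks higher at
    // 0.5 / (1 + 1) = 0.25.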

    // A scalar used to multiply the bounding box size to be used as the offset
    // in the score-to-distance-ratio formula. Only applicable when the
    // candidate_ranking_mode is score_scaled_distance_ratio.
    // The keypoint candidates are ranked using the formula:
    //   ranking_score = score / (distance + offset)
    // where 'score' is the keypoint heatmap score, 'distance' is the distance
    // between the heatmap peak location and the regressed joint location,
    // 'offset' is a function of the predicted bounding box:
    //   offset = max(bbox height, bbox width) * score_distance_multiplier
    optional float score_distance_multiplier = 28 [default = 0.1];
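
    // For example, with the default multiplier of 0.1 and a 100x60 box, the
    // offset is max(100, 60) * 0.1 = 10, so a candidate with score 0.9 at
    // distance 5 ranks as 0.9 / (5 + 10) = 0.06.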

    // A scalar used to multiply the Gaussian standard deviation to control the
    // Gaussian kernel which is used to weight the candidates. Only applicable
    // when the candidate_ranking_mode is gaussian_weighted.
    // The keypoint candidates are ranked using the formula:
    //   scores * exp((-distances^2) / (2 * sigma^2))
    // where 'distances' is the distance between the heatmap peak location and
    // the regressed joint location and 'sigma' is the Gaussian standard
    // deviation used in generating the Gaussian heatmap target, multiplied by
    // the 'std_dev_multiplier'.
    optional float std_dev_multiplier = 29 [default = 1.0];

    // The radius (in units of output pixels) around the heatmap peak within
    // which offset targets are assigned. If set to 0, the offset target is
    // only assigned to the heatmap peak (same behavior as the original paper).
    optional int32 offset_peak_radius = 17 [default = 0];

    // Indicates whether to assign offsets for each keypoint channel
    // separately. If set False, the output offset target has the shape
    // [batch_size, out_height, out_width, 2] (same behavior as the original
    // paper). If set True, the output offset target has the shape [batch_size,
    // out_height, out_width, 2 * num_keypoints] (recommended when the
    // offset_peak_radius is not zero).
    optional bool per_keypoint_offset = 18 [default = false];

    // Indicates whether to predict the depth of each keypoint. Note that this
    // is only supported in the single-class keypoint task.
    optional bool predict_depth = 19 [default = false];

    // Indicates whether to predict depths for each keypoint channel
    // separately. If set False, the output depth target has the shape
    // [batch_size, out_height, out_width, 1]. If set True, the output depth
    // target has the shape [batch_size, out_height, out_width,
    // num_keypoints]. It is recommended to set this field and
    // "per_keypoint_offset" to True at the same time.
    optional bool per_keypoint_depth = 20 [default = false];

    // The weight of the keypoint depth loss.
    optional float keypoint_depth_loss_weight = 21 [default = 1.0];

    // Whether keypoints outside the image frame should be clipped back to the
    // image boundary. If true, the keypoints that are clipped have scores set
    // to 0.0.
    optional bool clip_out_of_frame_keypoints = 23 [default = false];

    // Whether instances should be rescored based on keypoint confidences. If
    // False, will use the detection score (from the object center heatmap). If
    // True, will compute new scores with:
    // new_score = o * (1/k) sum {s_i}
    // where o is the object score, s_i is the score for keypoint i, and k is
    // the number of keypoints for that class.
    optional bool rescore_instances = 24 [default = false];
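
    // For example, an instance with object score o = 0.8 whose three keypoints
    // score 0.9, 0.6, and 0.3 is rescored to
    // 0.8 * (0.9 + 0.6 + 0.3) / 3 = 0.48.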

    // A scalar used when "rescore_instances" is set to True. The detection
    // score of an instance is set to be the average score among those keypoints
    // with scores higher than the threshold.
    optional float rescoring_threshold = 30 [default = 0.0];

    // The ratio used to multiply the output feature map size to determine the
    // denominator in the Gaussian formula. Only applicable when the
    // candidate_ranking_mode is set to be 'gaussian_weighted_const'.
    optional float gaussian_denom_ratio = 31 [default = 0.1];

    // Whether to use the keypoint postprocessing logic that replaces topk op
    // with argmax. Usually used when exporting the model for predicting
    // keypoints of multiple instances in the browser.
    optional bool argmax_postprocessing = 32 [default = false];

    // Parameters to determine the architecture of the keypoint heatmap
    // prediction head.
    optional PredictionHeadParams heatmap_head_params = 25;

    // Parameters to determine the architecture of the keypoint offset
    // prediction head.
    optional PredictionHeadParams offset_head_params = 26;

    // Parameters to determine the architecture of the keypoint regression
    // prediction head.
    optional PredictionHeadParams regress_head_params = 27;
  }
  repeated KeypointEstimation keypoint_estimation_task = 7;
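
  // Example (illustrative values only) of a human-pose task; the task and
  // class names are placeholders and must match the keypoint label map:
  //   keypoint_estimation_task {
  //     task_name: "human_pose"
  //     task_loss_weight: 1.0
  //     keypoint_class_name: "person"
  //     keypoint_regression_loss_weight: 0.1
  //     keypoint_heatmap_loss_weight: 1.0
  //     keypoint_offset_loss_weight: 1.0
  //     candidate_ranking_mode: "min_distance"
  //     loss {
  //       classification_loss {
  //         penalty_reduced_logistic_focal_loss { alpha: 2.0 beta: 4.0 }
  //       }
  //       localization_loss {
  //         l1_localization_loss {}
  //       }
  //     }
  //   }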

  // Parameters which are related to mask estimation task.
  // Note: Currently, CenterNet supports a weak instance segmentation, where
  // semantic segmentation masks are estimated, and then cropped based on
  // bounding box detections. Therefore, it is possible for the same image
  // pixel to be assigned to multiple instances.
  message MaskEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for segmentation loss.
    optional ClassificationLoss classification_loss = 2;

    // Each instance mask (one per detection) is cropped and resized (bilinear
    // resampling) from the predicted segmentation feature map. After
    // resampling, the masks are binarized with the provided score threshold.
    optional int32 mask_height = 4 [default = 256];
    optional int32 mask_width = 5 [default = 256];
    optional float score_threshold = 6 [default = 0.5];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 3 [default = -2.19];

    // Parameters to determine the architecture of the segmentation mask
    // prediction head.
    optional PredictionHeadParams mask_head_params = 7;
  }
  optional MaskEstimation mask_estimation_task = 8;
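
  // Example (illustrative values only), assuming a weighted softmax
  // classification loss (`weighted_softmax`) is available in losses.proto:
  //   mask_estimation_task {
  //     task_loss_weight: 1.0
  //     mask_height: 256
  //     mask_width: 256
  //     score_threshold: 0.5
  //     classification_loss {
  //       weighted_softmax {}
  //     }
  //   }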

  // Parameters which are related to DensePose estimation task.
  // http://densepose.org/
  message DensePoseEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Class ID (0-indexed) that corresponds to the object in the label map that
    // contains DensePose data.
    optional int32 class_id = 2;

    // Loss configuration for DensePose heatmap and regression losses. Note
    // that the localization loss is used for surface coordinate losses and
    // classification loss is used for part classification losses.
    optional Loss loss = 3;

    // The number of body parts.
    optional int32 num_parts = 4 [default = 24];

    // Loss weights for the two DensePose heads.
    optional float part_loss_weight = 5 [default = 1.0];
    optional float coordinate_loss_weight = 6 [default = 1.0];

    // Whether to upsample the prediction feature maps back to the original
    // input dimension prior to applying loss. This has the benefit of
    // maintaining finer groundtruth location information.
    optional bool upsample_to_input_res = 7 [default = true];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 8 [default = -2.19];
  }
  optional DensePoseEstimation densepose_estimation_task = 9;
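
  // Example (illustrative values only) for a label map whose class 0 carries
  // DensePose annotations:
  //   densepose_estimation_task {
  //     task_loss_weight: 0.5
  //     class_id: 0
  //     num_parts: 24
  //     part_loss_weight: 1.0
  //     coordinate_loss_weight: 1.0
  //     upsample_to_input_res: true
  //     loss {
  //       classification_loss { weighted_softmax {} }
  //       localization_loss { l1_localization_loss {} }
  //     }
  //   }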

  // Parameters which are related to tracking embedding estimation task.
  // A Simple Baseline for Multi-Object Tracking [2]
  // [2]: https://arxiv.org/abs/2004.01888
  message TrackEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // The maximum track ID of the dataset.
    optional int32 num_track_ids = 2;

    // The embedding size for re-identification (ReID) task in tracking.
    optional int32 reid_embed_size = 3 [default = 128];

    // The number of (fully-connected, batch-norm, relu) layers for track ID
    // classification head. The output dimension of each intermediate FC layer
    // will all be 'reid_embed_size'. The last FC layer will directly project to
    // the track ID classification space of size 'num_track_ids' without
    // batch-norm and relu layers.
    optional int32 num_fc_layers = 4 [default = 1];

    // Classification loss configuration for ReID loss.
    optional ClassificationLoss classification_loss = 5;
  }
  optional TrackEstimation track_estimation_task = 10;
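
  // Example (illustrative values only) for a dataset with, hypothetically,
  // 1000 distinct track IDs:
  //   track_estimation_task {
  //     task_loss_weight: 1.0
  //     num_track_ids: 1000
  //     reid_embed_size: 128
  //     num_fc_layers: 1
  //     classification_loss {
  //       weighted_softmax {}
  //     }
  //   }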

  // Temporal offset prediction head similar to CenterTrack.
  // Currently our implementation adopts LSTM, different from original paper.
  // See go/lstd-centernet for more details.
  // Tracking Objects as Points [3]
  // [3]: https://arxiv.org/abs/2004.01177
  message TemporalOffsetEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Localization loss configuration for offset loss.
    optional LocalizationLoss localization_loss = 2;
  }
  optional TemporalOffsetEstimation temporal_offset_task = 12;
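
  // Example (illustrative values only), again assuming `l1_localization_loss`
  // from losses.proto:
  //   temporal_offset_task {
  //     task_loss_weight: 1.0
  //     localization_loss {
  //       l1_localization_loss {}
  //     }
  //   }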

  // Mask prediction support using DeepMAC. See https://arxiv.org/abs/2104.00613
  // Next ID 34
  message DeepMACMaskEstimation {
    // The loss used for penalizing mask predictions.
    optional ClassificationLoss classification_loss = 1;

    // Weight of the mask prediction loss.
    optional float task_loss_weight = 2 [default = 1.0];

    // The dimension of the per-instance embedding.
    optional int32 dim = 3 [default = 256];

    // The dimension of the per-pixel embedding.
    optional int32 pixel_embedding_dim = 4 [default = 16];

    // If set, masks are only kept for classes listed here. Masks are deleted
    // for all other classes. Note that this is only done at training time, eval
    // behavior is unchanged.
    repeated int32 allowed_masked_classes_ids = 5;

    // The size of cropped pixel embedding that goes into the 2D mask prediction
    // network (RoI align).
    optional int32 mask_size = 6 [default = 32];

    // If set to a positive value, we subsample instances by this amount to
    // save memory during training.
    optional int32 mask_num_subsamples = 7 [default = -1];

    // Whether or not to use (x, y) coordinates as input to mask net.
    optional bool use_xy = 8 [default = true];

    // Defines the kind of architecture we want to use for mask network.
    optional string network_type = 9 [default = "hourglass52"];

    // Whether or not we want to use instance embedding in mask network.
    optional bool use_instance_embedding = 10 [default = true];

    // Number of channels in the initial block of the mask prediction network.
    optional int32 num_init_channels = 11 [default = 64];

    // Whether or not to predict masks at full resolution. If true, we predict
    // masks at the resolution of the output stride. Otherwise, masks are
    // predicted at the resolution defined by mask_size.
    optional bool predict_full_resolution_masks = 12 [default = false];

    // If predict_full_resolution_masks is set, this parameter controls the size
    // of cropped masks returned by post-process. To be compatible with the rest
    // of the API, masks are always cropped and resized according to detected
    // boxes in postprocess.
    optional int32 postprocess_crop_size = 13 [default = 256];

    // The maximum relative amount by which boxes will be jittered before
    // RoI crop happens. The x and y coordinates of the box are jittered
    // relative to width and height respectively.
    optional float max_roi_jitter_ratio = 14 [default = 0.0];

    // The mode for jittering box ROIs. See RandomJitterBoxes in
    // preprocessor.proto for more details.
    optional RandomJitterBoxes.JitterMode jitter_mode = 15 [default = DEFAULT];

    // Weight for the box consistency loss as described in the BoxInst paper
    // https://arxiv.org/abs/2012.02310
    optional float box_consistency_loss_weight = 16 [default = 0.0];

    // Threshold on the color similarity between neighboring pixels used by
    // the color consistency (pairwise) loss described in the BoxInst paper.
    optional float color_consistency_threshold = 17 [default = 0.4];

    // Dilation (spacing, in pixels) of the pixel neighborhood used when
    // comparing colors for the color consistency loss.
    optional int32 color_consistency_dilation = 18 [default = 2];

    // Weight for the color consistency loss.
    optional float color_consistency_loss_weight = 19 [default = 0.0];

    // How to normalize the box consistency loss. See the LossNormalize enum
    // below for the available options.
    optional LossNormalize box_consistency_loss_normalize = 20
        [default = NORMALIZE_AUTO];

    // If set, will use the bounding box tightness prior approach. This means
    // that the max will be restricted to only be inside the box for both
    // dimensions. See details here:
    // https://papers.nips.cc/paper/2019/hash/e6e713296627dff6475085cc6a224464-Abstract.html
    optional bool box_consistency_tightness = 21 [default = false];

    // Number of steps over which the weight of the color consistency loss is
    // ramped up, beginning at 'color_consistency_warmup_start'.
    optional int32 color_consistency_warmup_steps = 22 [default = 0];

    // Training step at which the color consistency loss warmup begins.
    optional int32 color_consistency_warmup_start = 23 [default = 0];

    // This flag controls whether or not we use the outputs from only the
    // last stage of the hourglass for training the mask-heads.
    //
    // DeepMAC has been refactored to process the entire batch at once,
    // instead of the previous (simple) approach of processing one sample at
    // a time. Because of this, we need to set this flag to continue using
    // the old models with the same training hardware.
    //
    // This flag is not needed for 1024x1024 models. The performance and
    // memory usage are the same as before.
    //
    // For 512x512 models:
    // - Setting this flag to true will let the model train on TPU-v3 32
    //   chips. We observed a small (0.26 mAP) performance drop when doing so.
    // - Setting this flag to false (default) increases the TPU requirement
    //   to TPU-v3 128 and reproduces previously demonstrated performance
    //   within error bars.
    optional bool use_only_last_stage = 24 [default = false];

    // Parameters controlling the augmented self-supervision loss, which
    // penalizes disagreement between mask predictions on an image and on an
    // augmented (translated/flipped/rescaled) copy of it.

    // Maximum translation (as a fraction of the image size) applied when
    // generating the augmented view.
    optional float augmented_self_supervision_max_translation = 25
        [default = 0.0];

    // Probability of horizontally flipping the augmented view.
    optional float augmented_self_supervision_flip_probability = 26
        [default = 0.0];

    // Weight of the augmented self-supervision loss.
    optional float augmented_self_supervision_loss_weight = 27 [default = 0.0];

    // Training step at which the self-supervision loss warmup begins, and the
    // number of steps over which its weight is ramped up.
    optional int32 augmented_self_supervision_warmup_start = 28 [default = 0];
    optional int32 augmented_self_supervision_warmup_steps = 29 [default = 0];

    // The loss used to compare predictions between the original and augmented
    // views. See the AugmentedSelfSupervisionLoss enum below.
    optional AugmentedSelfSupervisionLoss augmented_self_supervision_loss = 30
        [default = LOSS_DICE];

    // Minimum and maximum scale factors for rescaling the augmented view.
    optional float augmented_self_supervision_scale_min = 31 [default = 1.0];
    optional float augmented_self_supervision_scale_max = 32 [default = 1.0];

    // The loss weight for the pointly supervised loss as defined in the paper
    // https://arxiv.org/abs/2104.06404
    //
    // We assume that point supervision is given through a keypoint dataset,
    // where each keypoint represents a sampled point, and its depth indicates
    // whether it is a foreground or background point.
    // Depth = +1 is assumed to be foreground and
    // Depth = -1 is assumed to be background.
    optional float pointly_supervised_keypoint_loss_weight = 33 [default = 0.0];

  }

  optional DeepMACMaskEstimation deepmac_mask_estimation = 14;
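
  // Example (illustrative values only) of a DeepMAC block predicting 32x32
  // masks, assuming a weighted sigmoid loss (`weighted_sigmoid`) is available
  // in losses.proto:
  //   deepmac_mask_estimation {
  //     task_loss_weight: 5.0
  //     dim: 256
  //     pixel_embedding_dim: 16
  //     mask_size: 32
  //     network_type: "hourglass52"
  //     classification_loss {
  //       weighted_sigmoid {}
  //     }
  //   }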

  // CenterNet does not apply conventional post processing operations such as
  // non max suppression as it applies a max-pool operator on box centers.
  // However, in some cases we observe the need to remove duplicate predictions
  // from CenterNet. Use this optional parameter to apply traditional non max
  // suppression and score thresholding.
  optional PostProcessing post_processing = 24;
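
  // Example (illustrative values only), assuming the batch_non_max_suppression
  // message defined in post_processing.proto:
  //   post_processing {
  //     batch_non_max_suppression {
  //       score_threshold: 0.1
  //       iou_threshold: 0.5
  //       max_detections_per_class: 100
  //       max_total_detections: 100
  //     }
  //   }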

  // If set, dictionary items returned by the predict() function
  // are appended to the output of postprocess().
  optional bool output_prediction_dict = 25 [default = false];
}

enum LossNormalize {
  NORMALIZE_AUTO = 0;  // SUM for 2D inputs (dice loss) and MEAN for others.
  NORMALIZE_GROUNDTRUTH_COUNT = 1;
  NORMALIZE_BALANCED = 3;
}

enum AugmentedSelfSupervisionLoss {
  LOSS_UNSET = 0;
  LOSS_DICE = 1;
  LOSS_MSE = 2;
  LOSS_KL_DIV = 3;
}

message CenterNetFeatureExtractor {
  optional string type = 1;

  // Channel means to be subtracted from each image channel. If not specified,
  // we use a default value of 0.
  repeated float channel_means = 2;

  // Channel standard deviations. Each channel will be normalized by dividing
  // it by its standard deviation. If not specified, we use a default value
  // of 1.
  repeated float channel_stds = 3;

  // If set, will change channel order to be [blue, green, red]. This can be
  // useful to be compatible with some pre-trained feature extractors.
  optional bool bgr_ordering = 4 [default = false];

  // If set, the feature upsampling layers will be constructed with
  // separable convolutions. This is typically applied to feature pyramid
  // network if any.
  optional bool use_depthwise = 5 [default = false];

  // Depth multiplier. Only valid for specific models (e.g. MobileNet). See
  // subclasses of `CenterNetFeatureExtractor`.
  optional float depth_multiplier = 9 [default = 1.0];

  // Whether to use separable convolutions. Only valid for specific
  // models. See subclasses of `CenterNetFeatureExtractor`.
  optional bool use_separable_conv = 10 [default = false];

  // Which interpolation method to use for the upsampling ops in the FPN.
  // Currently only valid for CenterNetMobileNetV2FPNFeatureExtractor. The
  // value can be one of 'nearest' or 'bilinear'.
  optional string upsampling_interpolation = 11 [default = 'nearest'];
}
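
// Example (illustrative values only) of a feature extractor block; the type
// string must match a registered extractor, e.g. "hourglass_104", and the
// channel statistics shown here are placeholders, not the exact values used
// by any released checkpoint:
//   feature_extractor {
//     type: "hourglass_104"
//     channel_means: [104.0, 114.0, 120.0]
//     channel_stds: [74.0, 70.0, 71.0]
//     bgr_ordering: true
//   }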