syntax = "proto2";

package object_detection.protos;

import "object_detection/protos/image_resizer.proto";
import "object_detection/protos/losses.proto";
import "object_detection/protos/post_processing.proto";

// Configuration for the CenterNet meta architecture from the "Objects as
// Points" paper [1]
// [1]: https://arxiv.org/abs/1904.07850
// Next Id = 25
message CenterNet {
  // Number of classes to predict.
  optional int32 num_classes = 1;

  // Feature extractor config.
  optional CenterNetFeatureExtractor feature_extractor = 2;

  // Image resizer for preprocessing the input image.
  optional ImageResizer image_resizer = 3;

  // If set, all task heads will be constructed with separable convolutions.
  optional bool use_depthwise = 13 [default = false];

  // Indicates whether or not to use the sparse version of the Op that computes
  // the center heatmaps. The sparse version scales better with the number of
  // channels in the heatmap, but in some cases is known to cause an OOM error.
  // TODO(b/170989061) When bug is fixed, make this the default behavior.
  optional bool compute_heatmap_sparse = 15 [default = false];

  // Parameters which are related to the object detection task.
  message ObjectDetection {
    // The original fields are moved to ObjectCenterParams or deleted.
    reserved 2, 5, 6, 7;

    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Weight for the offset localization loss.
    optional float offset_loss_weight = 3 [default = 1.0];

    // Weight for the height/width localization loss.
    optional float scale_loss_weight = 4 [default = 0.1];

    // Localization loss configuration for object scale and offset losses.
    optional LocalizationLoss localization_loss = 8;
  }
  optional ObjectDetection object_detection_task = 4;

  // Parameters related to object center prediction. This is required for both
  // object detection and keypoint estimation tasks.
  message ObjectCenterParams {
    // Weight for the object center loss.
    optional float object_center_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for object center loss.
    optional ClassificationLoss classification_loss = 2;

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 3 [default = -2.19];

    // The minimum IOU overlap boxes need to have to not be penalized.
    optional float min_box_overlap_iou = 4 [default = 0.7];

    // Maximum number of boxes to predict.
    optional int32 max_box_predictions = 5 [default = 100];

    // If set, loss is only computed for the labeled classes.
    optional bool use_labeled_classes = 6 [default = false];

    // The keypoint weights used for calculating the location of the object
    // center. When the field is provided, the number of weights needs to be
    // the same as the number of keypoints. The object center is calculated as
    // the weighted mean of the keypoint locations. When the field is not
    // provided, the object center is determined by the bounding box
    // groundtruth annotations (default behavior).
    repeated float keypoint_weights_for_center = 7;
  }
  optional ObjectCenterParams object_center_params = 5;

  // Path of the file that contains the label map along with the keypoint
  // information, including the keypoint indices, corresponding labels, and the
  // corresponding class. The file should be the same one as used in the input
  // pipeline. Note that the file is expected to contain a StringIntLabelMap
  // proto in plain text format.
  // It is required only if the keypoint estimation task is specified.
  optional string keypoint_label_map_path = 6;
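  // A minimal sketch of what such a label map file might look like, assuming
  // the plain-text StringIntLabelMap format with per-item "keypoints" entries
  // used by the input pipeline (the class and keypoint names below are
  // illustrative only):
  //
  //   item {
  //     id: 1
  //     name: "person"
  //     keypoints {
  //       id: 0
  //       label: "nose"
  //     }
  //     keypoints {
  //       id: 1
  //       label: "left_shoulder"
  //     }
  //   }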
  // Parameters which are related to the keypoint estimation task.
  message KeypointEstimation {
    // Name of the task, e.g. "human pose". Note that the task name should be
    // unique to each keypoint task.
    optional string task_name = 1;

    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 2 [default = 1.0];

    // Loss configuration for keypoint heatmap, offset, regression losses. Note
    // that the localization loss is used for offset/regression losses and the
    // classification loss is used for the heatmap loss.
    optional Loss loss = 3;

    // The name of the class that contains the keypoints for this task. This is
    // used to retrieve the corresponding keypoint indices from the label map.
    // Note that this corresponds to the "name" field, not "display_name".
    optional string keypoint_class_name = 4;

    // The standard deviation of the Gaussian kernel used to generate the
    // keypoint heatmap. The unit is the pixel in the output image. It provides
    // the flexibility of using a different Gaussian kernel size for each
    // keypoint class. Note that if provided, the keypoint standard deviations
    // will be overridden by the values specified here; otherwise, the default
    // value 5.0 will be used.
    // TODO(yuhuic): Update the default value once we find the best value.
    map<string, float> keypoint_label_to_std = 5;

    // Loss weights corresponding to different heads.
    optional float keypoint_regression_loss_weight = 6 [default = 1.0];
    optional float keypoint_heatmap_loss_weight = 7 [default = 1.0];
    optional float keypoint_offset_loss_weight = 8 [default = 1.0];

    // The initial bias value of the convolution kernel of the keypoint heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1. See "Focal Loss for Dense Object Detection"
    // at https://arxiv.org/abs/1708.02002.
    optional float heatmap_bias_init = 9 [default = -2.19];

    // The heatmap score threshold for a keypoint to become a valid candidate.
    optional float keypoint_candidate_score_threshold = 10 [default = 0.1];

    // The maximum number of candidates to retrieve for each keypoint.
    optional int32 num_candidates_per_keypoint = 11 [default = 100];

    // Max pool kernel size used to pull off peak score locations in a
    // neighborhood (independently for each keypoint type).
    optional int32 peak_max_pool_kernel_size = 12 [default = 3];

    // The default score to use for regressed keypoints that are not
    // successfully snapped to a nearby candidate.
    optional float unmatched_keypoint_score = 13 [default = 0.1];

    // The multiplier to expand the bounding boxes (either the provided boxes
    // or those which tightly cover the regressed keypoints). Note that the
    // expanded box for an instance becomes the feasible search window for all
    // associated keypoints.
    optional float box_scale = 14 [default = 1.2];

    // The scale parameter that multiplies the largest dimension of a bounding
    // box. The resulting distance becomes a search radius for candidates in
    // the vicinity of each regressed keypoint.
    optional float candidate_search_scale = 15 [default = 0.3];
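    // As a worked example of the two fields above (illustrative numbers,
    // assuming the search radius is derived from the original box's largest
    // side): for a 200 x 100 box, box_scale = 1.2 expands the keypoint search
    // window to 240 x 120, and candidate_search_scale = 0.3 yields a
    // candidate search radius of 0.3 * 200 = 60 output pixels around each
    // regressed keypoint.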
    // One of ['min_distance', 'score_distance_ratio'] indicating how to select
    // the keypoint candidate.
    optional string candidate_ranking_mode = 16 [default = "min_distance"];

    // The score distance ratio offset, only used if candidate_ranking_mode is
    // 'score_distance_ratio'. The offset is used in the maximization of the
    // score distance ratio, defined as:
    //   keypoint_score / (distance + score_distance_offset)
    optional float score_distance_offset = 22 [default = 1.0];

    // The radius (in units of output pixels) around the heatmap peak to assign
    // the offset targets. If set to 0, then the offset target will only be
    // assigned to the heatmap peak (same behavior as the original paper).
    optional int32 offset_peak_radius = 17 [default = 0];

    // Indicates whether to assign offsets for each keypoint channel
    // separately. If set to False, the output offset target has the shape
    // [batch_size, out_height, out_width, 2] (same behavior as the original
    // paper). If set to True, the output offset target has the shape
    // [batch_size, out_height, out_width, 2 * num_keypoints] (recommended
    // when the offset_peak_radius is not zero).
    optional bool per_keypoint_offset = 18 [default = false];

    // Indicates whether to predict the depth of each keypoint. Note that this
    // is only supported in the single class keypoint task.
    optional bool predict_depth = 19 [default = false];

    // Indicates whether to predict depths for each keypoint channel
    // separately. If set to False, the output depth target has the shape
    // [batch_size, out_height, out_width, 1]. If set to True, the output
    // depth target has the shape [batch_size, out_height, out_width,
    // num_keypoints]. It is recommended to set this value and
    // "per_keypoint_offset" to True at the same time.
    optional bool per_keypoint_depth = 20 [default = false];

    // The weight of the keypoint depth loss.
    optional float keypoint_depth_loss_weight = 21 [default = 1.0];

    // Whether keypoints outside the image frame should be clipped back to the
    // image boundary. If true, the keypoints that are clipped have their
    // scores set to 0.0.
    optional bool clip_out_of_frame_keypoints = 23 [default = false];

    // Whether instances should be rescored based on keypoint confidences. If
    // False, will use the detection score (from the object center heatmap).
    // If True, will compute new scores with:
    //   new_score = o * (1/k) * sum_i {s_i}
    // where o is the object score, s_i is the score for keypoint i, and k is
    // the number of keypoints for that class.
    optional bool rescore_instances = 24 [default = false];
  }
  repeated KeypointEstimation keypoint_estimation_task = 7;
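  // A hedged configuration sketch for a single keypoint task (all values are
  // illustrative, not tuned recommendations; the class name must match the
  // label map referenced by keypoint_label_map_path, and the loss types are
  // assumed to be the l1_localization_loss and
  // penalty_reduced_logistic_focal_loss options from losses.proto):
  //
  //   keypoint_estimation_task {
  //     task_name: "human_pose"
  //     task_loss_weight: 1.0
  //     keypoint_class_name: "person"
  //     loss {
  //       localization_loss {
  //         l1_localization_loss {}
  //       }
  //       classification_loss {
  //         penalty_reduced_logistic_focal_loss {
  //           alpha: 2.0
  //           beta: 4.0
  //         }
  //       }
  //     }
  //     keypoint_regression_loss_weight: 0.1
  //     candidate_ranking_mode: "min_distance"
  //   }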
  // Parameters which are related to the mask estimation task.
  // Note: Currently, CenterNet supports a weak form of instance segmentation,
  // where semantic segmentation masks are estimated and then cropped based on
  // bounding box detections. Therefore, it is possible for the same image
  // pixel to be assigned to multiple instances.
  message MaskEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Classification loss configuration for the segmentation loss.
    optional ClassificationLoss classification_loss = 2;

    // Each instance mask (one per detection) is cropped and resized (bilinear
    // resampling) from the predicted segmentation feature map. After
    // resampling, the masks are binarized with the provided score threshold.
    optional int32 mask_height = 4 [default = 256];
    optional int32 mask_width = 5 [default = 256];
    optional float score_threshold = 6 [default = 0.5];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 3 [default = -2.19];
  }
  optional MaskEstimation mask_estimation_task = 8;

  // Parameters which are related to the DensePose estimation task.
  // http://densepose.org/
  message DensePoseEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Class ID (0-indexed) that corresponds to the object in the label map
    // that contains DensePose data.
    optional int32 class_id = 2;

    // Loss configuration for DensePose heatmap and regression losses. Note
    // that the localization loss is used for surface coordinate losses and
    // the classification loss is used for part classification losses.
    optional Loss loss = 3;

    // The number of body parts.
    optional int32 num_parts = 4 [default = 24];

    // Loss weights for the two DensePose heads.
    optional float part_loss_weight = 5 [default = 1.0];
    optional float coordinate_loss_weight = 6 [default = 1.0];

    // Whether to upsample the prediction feature maps back to the original
    // input dimension prior to applying the loss. This has the benefit of
    // maintaining finer groundtruth location information.
    optional bool upsample_to_input_res = 7 [default = true];

    // The initial bias value of the convolution kernel of the class heatmap
    // prediction head. -2.19 corresponds to predicting foreground with
    // a probability of 0.1.
    optional float heatmap_bias_init = 8 [default = -2.19];
  }
  optional DensePoseEstimation densepose_estimation_task = 9;

  // Parameters which are related to the tracking embedding estimation task.
  // A Simple Baseline for Multi-Object Tracking [2]
  // [2]: https://arxiv.org/abs/2004.01888
  message TrackEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // The maximum track ID of the dataset.
    optional int32 num_track_ids = 2;

    // The embedding size for the re-identification (ReID) task in tracking.
    optional int32 reid_embed_size = 3 [default = 128];

    // The number of (fully-connected, batch-norm, relu) layers for the track
    // ID classification head. The output dimension of each intermediate FC
    // layer will be 'reid_embed_size'. The last FC layer will directly
    // project to the track ID classification space of size 'num_track_ids'
    // without batch-norm and relu layers.
    optional int32 num_fc_layers = 4 [default = 1];

    // Classification loss configuration for the ReID loss.
    optional ClassificationLoss classification_loss = 5;
  }
  optional TrackEstimation track_estimation_task = 10;
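  // A hedged sketch of a tracking task configuration (values are
  // illustrative; num_track_ids must be large enough to cover the largest
  // track ID present in the dataset, and weighted_softmax is assumed here as
  // one of the ClassificationLoss options from losses.proto):
  //
  //   track_estimation_task {
  //     task_loss_weight: 1.0
  //     num_track_ids: 1000
  //     reid_embed_size: 128
  //     num_fc_layers: 1
  //     classification_loss {
  //       weighted_softmax {}
  //     }
  //   }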
  // Temporal offset prediction head similar to CenterTrack.
  // Currently our implementation adopts an LSTM, different from the original
  // paper. See go/lstd-centernet for more details.
  // Tracking Objects as Points [3]
  // [3]: https://arxiv.org/abs/2004.01177
  message TemporalOffsetEstimation {
    // Weight of the task loss. The total loss of the model will be the
    // summation of task losses weighted by the weights.
    optional float task_loss_weight = 1 [default = 1.0];

    // Localization loss configuration for the offset loss.
    optional LocalizationLoss localization_loss = 2;
  }
  optional TemporalOffsetEstimation temporal_offset_task = 12;

  // CenterNet does not apply conventional post-processing operations such as
  // non-max suppression, since it applies a max-pool operator on box centers
  // instead. However, in some cases we observe the need to remove duplicate
  // predictions from CenterNet. Use this optional parameter to apply
  // traditional non-max suppression and score thresholding.
  optional PostProcessing post_processing = 24;
}

message CenterNetFeatureExtractor {
  optional string type = 1;

  // Channel means to be subtracted from each image channel. If not specified,
  // we use a default value of 0.
  repeated float channel_means = 2;

  // Channel standard deviations. Each channel will be normalized by dividing
  // it by its standard deviation. If not specified, we use a default value
  // of 1.
  repeated float channel_stds = 3;

  // If set, will change the channel order to be [blue, green, red]. This can
  // be useful for compatibility with some pre-trained feature extractors.
  optional bool bgr_ordering = 4 [default = false];

  // If set, the feature upsampling layers will be constructed with separable
  // convolutions. This typically applies to the feature pyramid network, if
  // any.
  optional bool use_depthwise = 5 [default = false];
}
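// A minimal illustrative sketch of a feature extractor configuration (the
// available "type" strings depend on the feature extractors registered in
// the model builder; "hourglass_104" and the normalization constants below
// are assumptions used as an example, not recommendations):
//
//   feature_extractor {
//     type: "hourglass_104"
//     channel_means: 104.01
//     channel_means: 114.03
//     channel_means: 119.92
//     channel_stds: 73.60
//     channel_stds: 69.89
//     channel_stds: 70.91
//     bgr_ordering: true
//   }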