Commit 130970ca authored by Vighnesh Birodkar, committed by TF Object Detection Team

Document use_only_last_stage flag in configs.

PiperOrigin-RevId: 426512912
parent e5e8bf3c
@@ -3,7 +3,6 @@
# mask head. This config is only trained on masks from the VOC classes in COCO
# and achieves a mask mAP of 32.5% on non-VOC classes.
# [1]: https://arxiv.org/abs/2104.00613
# [2]: https://arxiv.org/abs/1904.07850
# Train on TPU-32
@@ -55,6 +54,7 @@ model {
classification_loss {
weighted_sigmoid {}
}
use_only_last_stage: true
allowed_masked_classes_ids: [
1, # person
......
@@ -19,13 +19,16 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.python.keras.applications import resnet
import tensorflow.compat.v1 as tf
from object_detection.core import freezable_batch_norm
from object_detection.models.keras_models import model_utils
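# Prefer the ResNet implementation from the standalone `keras` package when it
# is available; fall back to the copy bundled with TensorFlow otherwise.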
try:
  from keras.applications import resnet  # pylint:disable=g-import-not-at-top
except ImportError:
  from tensorflow.python.keras.applications import resnet  # pylint:disable=g-import-not-at-top
def _fixed_padding(inputs, kernel_size, rate=1): # pylint: disable=invalid-name
"""Pads the input along the spatial dimensions independently of input size.
......
@@ -220,8 +220,8 @@ message CenterNet {
// scores * exp((-distances^2) / (2 * sigma^2))
// where 'distances' is the distance between the heatmap peak location and
// the regressed joint location and 'sigma' is the Gaussian standard
// deviation used in generating the Gaussian heatmap target multiplied by the
// 'std_dev_multiplier'.
// deviation used in generating the Gaussian heatmap target multiplied by
// the 'std_dev_multiplier'.
optional float std_dev_multiplier = 29 [default = 1.0];
// The radius (in the unit of output pixel) around heatmap peak to assign
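
The comment above describes the keypoint rescoring rule scores * exp((-distances^2) / (2 * sigma^2)). As a quick illustration (assumed helper and argument names, not code from this repository), the rule can be written as:

import numpy as np

def rescore_keypoints(scores, distances, gaussian_std, std_dev_multiplier=1.0):
  # sigma is the Gaussian heatmap standard deviation scaled by std_dev_multiplier.
  sigma = gaussian_std * std_dev_multiplier
  # Down-weight joints whose regressed location is far from the heatmap peak.
  return scores * np.exp(-(distances ** 2) / (2.0 * sigma ** 2))

# Example: a joint regressed 4 output pixels from its peak, with gaussian_std=2
# and the default multiplier, keeps exp(-2) ~= 13.5% of its original score.
print(rescore_keypoints(np.array([0.9]), np.array([4.0]), gaussian_std=2.0))
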
@@ -415,7 +415,7 @@ message CenterNet {
optional int32 dim = 3 [default = 256];
// The dimension of the per-pixel embedding
optional int32 pixel_embedding_dim = 4 [default=16];
optional int32 pixel_embedding_dim = 4 [default = 16];
// If set, masks are only kept for classes listed here. Masks are deleted
// for all other classes. Note that this is only done at training time, eval
@@ -424,75 +424,86 @@ message CenterNet {
// The size of cropped pixel embedding that goes into the 2D mask prediction
// network (RoI align).
optional int32 mask_size = 6 [default=32];
optional int32 mask_size = 6 [default = 32];
// If set to a positive value, we subsample instances by this amount to
// save memory during training.
optional int32 mask_num_subsamples = 67[default=-1];
optional int32 mask_num_subsamples = 67 [default = -1];
// Whether or not to use (x, y) coordinates as input to mask net.
optional bool use_xy = 8 [default=true];
optional bool use_xy = 8 [default = true];
// Defines the kind of architecture we want to use for mask network.
optional string network_type = 9 [default="hourglass52"];
optional string network_type = 9 [default = "hourglass52"];
// Whether or not we want to use instance embedding in mask network.
optional bool use_instance_embedding = 10 [default=true];
optional bool use_instance_embedding = 10 [default = true];
// Number of channels in the initial block of the mask prediction network.
optional int32 num_init_channels = 11 [default=64];
optional int32 num_init_channels = 11 [default = 64];
// Whether or not to predict masks at full resolution. If true, we predict
// masks at the resolution of the output stride. Otherwise, masks are
// predicted at resolution defined by mask_size
optional bool predict_full_resolution_masks = 12 [default=false];
optional bool predict_full_resolution_masks = 12 [default = false];
// If predict_full_resolution_masks is set, this parameter controls the size
// of cropped masks returned by post-process. To be compatible with the rest
// of the API, masks are always cropped and resized according to detected
// boxes in postprocess.
optional int32 postprocess_crop_size = 13 [default=256];
optional int32 postprocess_crop_size = 13 [default = 256];
// The maximum relative amount by which boxes will be jittered before
// RoI crop happens. The x and y coordinates of the box are jittered
// relative to width and height respectively.
optional float max_roi_jitter_ratio = 14 [default=0.0];
optional float max_roi_jitter_ratio = 14 [default = 0.0];
// The mode for jittering box ROIs. See RandomJitterBoxes in
// preprocessor.proto for more details
optional RandomJitterBoxes.JitterMode jitter_mode = 15 [default=DEFAULT];
optional RandomJitterBoxes.JitterMode jitter_mode = 15 [default = DEFAULT];
// Weight for the box consistency loss as described in the BoxInst paper
// https://arxiv.org/abs/2012.02310
optional float box_consistency_loss_weight = 16 [default=0.0];
optional float box_consistency_loss_weight = 16 [default = 0.0];
optional float color_consistency_threshold = 17 [default=0.4];
optional float color_consistency_threshold = 17 [default = 0.4];
optional int32 color_consistency_dilation = 18 [default=2];
optional int32 color_consistency_dilation = 18 [default = 2];
optional float color_consistency_loss_weight = 19 [default=0.0];
optional float color_consistency_loss_weight = 19 [default = 0.0];
optional LossNormalize box_consistency_loss_normalize = 20 [
default=NORMALIZE_AUTO];
optional LossNormalize box_consistency_loss_normalize = 20
[default = NORMALIZE_AUTO];
// If set, will use the bounding box tightness prior approach. This means
// that the max will be restricted to only be inside the box for both
// dimensions. See details here:
// https://papers.nips.cc/paper/2019/hash/e6e713296627dff6475085cc6a224464-Abstract.html
optional bool box_consistency_tightness = 21 [default=false];
optional bool box_consistency_tightness = 21 [default = false];
optional int32 color_consistency_warmup_steps = 22 [default=0];
optional int32 color_consistency_warmup_steps = 22 [default = 0];
optional int32 color_consistency_warmup_start = 23 [default=0];
optional int32 color_consistency_warmup_start = 23 [default = 0];
// This flag controls whether or not we use the outputs from only the
// last stage of the hourglass for training the mask-heads.
// DeepMAC has been refactored to process the entire batch at once,
// instead of the previous (simple) approach of processing one sample at
// a time. Because of this, the memory consumption has increased and
// it's crucial to only feed the mask head the last stage outputs
// from the hourglass. Doing so halves the memory requirement of the
// mask head and does not cause a drop in evaluation metrics.
optional bool use_only_last_stage = 24 [default=false];
// a time. Because of this, we need to set this flag to continue using
// the old models with the same training hardware.
// This flag is not needed for 1024x1024 models. The performance and
// memory usage are the same as before.
// For 512x512 models:
// - Setting this flag to true will let the model train on TPU-v3 32
// chips. We observed a small (0.26 mAP) performance drop when doing so.
// - Setting this flag to false (default) increases the TPU requirement
// to TPU-v3 128 and reproduces previously demonstrated performance
// within error bars.
optional bool use_only_last_stage = 24 [default = false];
}
optional DeepMACMaskEstimation deepmac_mask_estimation = 14;
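
Since use_only_last_stage is the field this commit documents and enables in the DeepMAC config above, a minimal sketch of setting it through the config protos may be useful. This is illustrative only: it assumes the CenterNet message shown here compiles to object_detection.protos.center_net_pb2, and the text fragment mirrors the deepmac_mask_estimation block edited in the config at the top of this commit.

from google.protobuf import text_format
from object_detection.protos import center_net_pb2  # assumed generated module name

# Text-format fragment mirroring the config change at the top of this commit.
_DEEPMAC_FRAGMENT = """
deepmac_mask_estimation {
  use_only_last_stage: true
}
"""

# With use_only_last_stage enabled, only the final hourglass stage feeds the
# mask heads, which (per the comment above) halves mask-head memory for
# 512x512 models at a small (~0.26 mAP) cost.
center_net = text_format.Parse(_DEEPMAC_FRAGMENT, center_net_pb2.CenterNet())
assert center_net.deepmac_mask_estimation.use_only_last_stage
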
@@ -547,4 +558,3 @@ message CenterNetFeatureExtractor {
optional string upsampling_interpolation = 11 [default = 'nearest'];
}