Commit 130970ca authored by Vighnesh Birodkar's avatar Vighnesh Birodkar Committed by TF Object Detection Team
Browse files

Document use_only_last_stage flag in configs.

PiperOrigin-RevId: 426512912
parent e5e8bf3c
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
# mask head. This config is only trained on masks from the VOC classes in COCO # mask head. This config is only trained on masks from the VOC classes in COCO
# and achieves a mask mAP of 32.5% on non-VOC classes. # and achieves a mask mAP of 32.5% on non-VOC classes.
# [1]: https://arxiv.org/abs/2104.00613 # [1]: https://arxiv.org/abs/2104.00613
# [2]: https://arxiv.org/abs/1904.07850
# Train on TPU-32 # Train on TPU-32
...@@ -55,6 +54,7 @@ model { ...@@ -55,6 +54,7 @@ model {
classification_loss { classification_loss {
weighted_sigmoid {} weighted_sigmoid {}
} }
use_only_last_stage: true
allowed_masked_classes_ids: [ allowed_masked_classes_ids: [
1, # person 1, # person
......
...@@ -19,13 +19,16 @@ from __future__ import absolute_import ...@@ -19,13 +19,16 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from tensorflow.python.keras.applications import resnet
import tensorflow.compat.v1 as tf import tensorflow.compat.v1 as tf
from object_detection.core import freezable_batch_norm from object_detection.core import freezable_batch_norm
from object_detection.models.keras_models import model_utils from object_detection.models.keras_models import model_utils
try:
from keras.applications import resnet # pylint:disable=g-import-not-at-top
except ImportError:
from tensorflow.python.keras.applications import resnet # pylint:disable=g-import-not-at-top
def _fixed_padding(inputs, kernel_size, rate=1): # pylint: disable=invalid-name def _fixed_padding(inputs, kernel_size, rate=1): # pylint: disable=invalid-name
"""Pads the input along the spatial dimensions independently of input size. """Pads the input along the spatial dimensions independently of input size.
......
...@@ -220,8 +220,8 @@ message CenterNet { ...@@ -220,8 +220,8 @@ message CenterNet {
// scores * exp((-distances^2) / (2 * sigma^2)) // scores * exp((-distances^2) / (2 * sigma^2))
// where 'distances' is the distance between the heatmap peak location and // where 'distances' is the distance between the heatmap peak location and
// the regressed joint location and 'sigma' is the Gaussian standard // the regressed joint location and 'sigma' is the Gaussian standard
// deviation used in generating the Gaussian heatmap target multiplied by the // deviation used in generating the Gaussian heatmap target multiplied by
// 'std_dev_multiplier'. // the 'std_dev_multiplier'.
optional float std_dev_multiplier = 29 [default = 1.0]; optional float std_dev_multiplier = 29 [default = 1.0];
// The radius (in the unit of output pixel) around heatmap peak to assign // The radius (in the unit of output pixel) around heatmap peak to assign
...@@ -415,7 +415,7 @@ message CenterNet { ...@@ -415,7 +415,7 @@ message CenterNet {
optional int32 dim = 3 [default = 256]; optional int32 dim = 3 [default = 256];
// The dimension of the per-pixel embedding // The dimension of the per-pixel embedding
optional int32 pixel_embedding_dim = 4 [default=16]; optional int32 pixel_embedding_dim = 4 [default = 16];
// If set, masks are only kept for classes listed here. Masks are deleted // If set, masks are only kept for classes listed here. Masks are deleted
// for all other classes. Note that this is only done at training time, eval // for all other classes. Note that this is only done at training time, eval
...@@ -424,75 +424,86 @@ message CenterNet { ...@@ -424,75 +424,86 @@ message CenterNet {
// The size of cropped pixel embedding that goes into the 2D mask prediction // The size of cropped pixel embedding that goes into the 2D mask prediction
// network (RoI align). // network (RoI align).
optional int32 mask_size = 6 [default=32]; optional int32 mask_size = 6 [default = 32];
// If set to a positive value, we subsample instances by this amount to // If set to a positive value, we subsample instances by this amount to
// save memory during training. // save memory during training.
optional int32 mask_num_subsamples = 67[default=-1]; optional int32 mask_num_subsamples = 67 [default = -1];
// Whether or not to use (x, y) coordinates as input to mask net. // Whether or not to use (x, y) coordinates as input to mask net.
optional bool use_xy = 8 [default=true]; optional bool use_xy = 8 [default = true];
// Defines the kind of architecture we want to use for mask network. // Defines the kind of architecture we want to use for mask network.
optional string network_type = 9 [default="hourglass52"]; optional string network_type = 9 [default = "hourglass52"];
// Whether or not we want to use instance embedding in mask network. // Whether or not we want to use instance embedding in mask network.
optional bool use_instance_embedding = 10 [default=true]; optional bool use_instance_embedding = 10 [default = true];
// Number of channels in the initial block of the mask prediction network. // Number of channels in the initial block of the mask prediction network.
optional int32 num_init_channels = 11 [default=64]; optional int32 num_init_channels = 11 [default = 64];
// Whether or not to predict masks at full resolution. If true, we predict // Whether or not to predict masks at full resolution. If true, we predict
// masks at the resolution of the output stride. Otherwise, masks are // masks at the resolution of the output stride. Otherwise, masks are
// predicted at resolution defined by mask_size // predicted at resolution defined by mask_size
optional bool predict_full_resolution_masks = 12 [default=false]; optional bool predict_full_resolution_masks = 12 [default = false];
// If predict_full_resolution_masks is set, this parameter controls the size // If predict_full_resolution_masks is set, this parameter controls the size
// of cropped masks returned by post-process. To be compatible with the rest // of cropped masks returned by post-process. To be compatible with the rest
// of the API, masks are always cropped and resized according to detected // of the API, masks are always cropped and resized according to detected
// boxes in postprocess. // boxes in postprocess.
optional int32 postprocess_crop_size = 13 [default=256]; optional int32 postprocess_crop_size = 13 [default = 256];
// The maximum relative amount by which boxes will be jittered before // The maximum relative amount by which boxes will be jittered before
// RoI crop happens. The x and y coordinates of the box are jittered // RoI crop happens. The x and y coordinates of the box are jittered
// relative to width and height respectively. // relative to width and height respectively.
optional float max_roi_jitter_ratio = 14 [default=0.0]; optional float max_roi_jitter_ratio = 14 [default = 0.0];
// The mode for jittering box ROIs. See RandomJitterBoxes in // The mode for jittering box ROIs. See RandomJitterBoxes in
// preprocessor.proto for more details // preprocessor.proto for more details
optional RandomJitterBoxes.JitterMode jitter_mode = 15 [default=DEFAULT]; optional RandomJitterBoxes.JitterMode jitter_mode = 15 [default = DEFAULT];
// Weight for the box consistency loss as described in the BoxInst paper // Weight for the box consistency loss as described in the BoxInst paper
// https://arxiv.org/abs/2012.02310 // https://arxiv.org/abs/2012.02310
optional float box_consistency_loss_weight = 16 [default=0.0]; optional float box_consistency_loss_weight = 16 [default = 0.0];
optional float color_consistency_threshold = 17 [default=0.4]; optional float color_consistency_threshold = 17 [default = 0.4];
optional int32 color_consistency_dilation = 18 [default=2]; optional int32 color_consistency_dilation = 18 [default = 2];
optional float color_consistency_loss_weight = 19 [default=0.0]; optional float color_consistency_loss_weight = 19 [default = 0.0];
optional LossNormalize box_consistency_loss_normalize = 20 [ optional LossNormalize box_consistency_loss_normalize = 20
default=NORMALIZE_AUTO]; [default = NORMALIZE_AUTO];
// If set, will use the bounding box tightness prior approach. This means // If set, will use the bounding box tightness prior approach. This means
// that the max will be restricted to only be inside the box for both // that the max will be restricted to only be inside the box for both
// dimensions. See details here: // dimensions. See details here:
// https://papers.nips.cc/paper/2019/hash/e6e713296627dff6475085cc6a224464-Abstract.html // https://papers.nips.cc/paper/2019/hash/e6e713296627dff6475085cc6a224464-Abstract.html
optional bool box_consistency_tightness = 21 [default=false]; optional bool box_consistency_tightness = 21 [default = false];
optional int32 color_consistency_warmup_steps = 22 [default=0]; optional int32 color_consistency_warmup_steps = 22 [default = 0];
optional int32 color_consistency_warmup_start = 23 [default=0]; optional int32 color_consistency_warmup_start = 23 [default = 0];
// This flag controls whether or not we use the outputs from only the
// last stage of the hourglass for training the mask-heads.
// DeepMAC has been refactored to process the entire batch at once, // DeepMAC has been refactored to process the entire batch at once,
// instead of the previous (simple) approach of processing one sample at // instead of the previous (simple) approach of processing one sample at
// a time. Because of this, the memory consumption has increased and // a time. Because of this, we need to set this flag to continue using
// it's crucial to only feed the mask head the last stage outputs // the old models with the same training hardware.
// from the hourglass. Doing so halves the memory requirement of the
// mask head and does not cause a drop in evaluation metrics. // This flag is not needed for 1024x1024 models. The performance and
optional bool use_only_last_stage = 24 [default=false]; // memory usage are the same as before.
// For 512x512 models
// - Setting this flag to true will let the model train on TPU-v3 32
// chips. We observed a small (0.26 mAP) performance drop when doing so.
// - Setting this flag to false (default) increases the TPU requirement
// to TPU-v3 128 and reproduces previously demonstrated performance
// within error bars.
optional bool use_only_last_stage = 24 [default = false];
} }
optional DeepMACMaskEstimation deepmac_mask_estimation = 14; optional DeepMACMaskEstimation deepmac_mask_estimation = 14;
...@@ -506,7 +517,7 @@ message CenterNet { ...@@ -506,7 +517,7 @@ message CenterNet {
} }
enum LossNormalize { enum LossNormalize {
NORMALIZE_AUTO = 0; // SUM for 2D inputs (dice loss) and MEAN for others. NORMALIZE_AUTO = 0; // SUM for 2D inputs (dice loss) and MEAN for others.
NORMALIZE_GROUNDTRUTH_COUNT = 1; NORMALIZE_GROUNDTRUTH_COUNT = 1;
NORMALIZE_BALANCED = 3; NORMALIZE_BALANCED = 3;
} }
...@@ -547,4 +558,3 @@ message CenterNetFeatureExtractor { ...@@ -547,4 +558,3 @@ message CenterNetFeatureExtractor {
optional string upsampling_interpolation = 11 [default = 'nearest']; optional string upsampling_interpolation = 11 [default = 'nearest'];
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment