# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Classes to build various prediction heads in all supported models."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools

import numpy as np
import tensorflow as tf

from official.vision.detection.modeling.architecture import keras_utils
from official.vision.detection.modeling.architecture import nn_ops
from official.vision.detection.ops import spatial_transform_ops


class RpnHead(tf.keras.layers.Layer):
  """Region Proposal Network head."""

  def __init__(
      self,
      min_level,
      max_level,
      anchors_per_location,
      num_convs=2,
      num_filters=256,
      use_separable_conv=False,
      activation='relu',
      use_batch_norm=True,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Initialize params to build Region Proposal Network head.

    Args:
      min_level: `int` number of minimum feature level.
      max_level: `int` number of maximum feature level.
      anchors_per_location: `int` number of anchors per pixel location.
      num_convs: `int` number that represents the number of the intermediate
        conv layers before the prediction.
      num_filters: `int` number that represents the number of filters of the
        intermediate conv layers.
      use_separable_conv: `bool`, indicating whether the separable conv layers
        are used.
      activation: activation function. Supports 'relu' and 'swish'.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._min_level = min_level
    self._max_level = max_level
    self._anchors_per_location = anchors_per_location
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm

    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf.keras.layers.Conv2D,
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
          bias_initializer=tf.zeros_initializer())

    self._rpn_conv = self._conv2d_op(
        num_filters,
        kernel_size=(3, 3),
        strides=(1, 1),
        activation=(None if self._use_batch_norm else self._activation_op),
        padding='same',
        name='rpn')
    self._rpn_class_conv = self._conv2d_op(
        anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-class')
    self._rpn_box_conv = self._conv2d_op(
        4 * anchors_per_location,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding='valid',
        name='rpn-box')

    self._norm_activations = {}
    if self._use_batch_norm:
      for level in range(self._min_level, self._max_level + 1):
        self._norm_activations[level] = norm_activation(name='rpn-l%d-bn' %
                                                        level)

  def _shared_rpn_heads(self, features, anchors_per_location, level,
                        is_training):
    """Shared RPN heads."""
    features = self._rpn_conv(features)
    if self._use_batch_norm:
      # The batch normalization layers are not shared between levels.
      features = self._norm_activations[level](
          features, is_training=is_training)
    # Proposal classification scores
    scores = self._rpn_class_conv(features)
    # Proposal bbox regression deltas
    bboxes = self._rpn_box_conv(features)

    return scores, bboxes

  def __call__(self, features, is_training=None):

    scores_outputs = {}
    box_outputs = {}

    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('rpn_head'):
      for level in range(self._min_level, self._max_level + 1):
        scores_output, box_output = self._shared_rpn_heads(
            features[level], self._anchors_per_location, level, is_training)
        scores_outputs[level] = scores_output
        box_outputs[level] = box_output
      return scores_outputs, box_outputs
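
# Example usage of RpnHead (an illustrative sketch, not part of the original
# API surface): the level range, anchor count, and feature shapes below are
# assumed values. `fpn_features` is a dict mapping each level in
# [min_level, max_level] to a [batch, h_l, w_l, num_filters] float tensor,
# e.g. the output of an FPN.
#
#   rpn_head = RpnHead(min_level=2, max_level=6, anchors_per_location=3)
#   scores, boxes = rpn_head(fpn_features, is_training=True)
#   # scores[l]: [batch, h_l, w_l, 3]; boxes[l]: [batch, h_l, w_l, 4 * 3].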


class FastrcnnHead(tf.keras.layers.Layer):
  """Fast R-CNN box head."""

  def __init__(
      self,
      num_classes,
      num_convs=0,
      num_filters=256,
      use_separable_conv=False,
      num_fcs=2,
      fc_dims=1024,
      activation='relu',
      use_batch_norm=True,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Initialize params to build Fast R-CNN box head.

    Args:
      num_classes: an integer for the number of classes.
      num_convs: `int` number that represents the number of the intermediate
        conv layers before the FC layers.
      num_filters: `int` number that represents the number of filters of the
        intermediate conv layers.
      use_separable_conv: `bool`, indicating whether the separable conv layers
        are used.
      num_fcs: `int` number that represents the number of FC layers before the
        predictions.
      fc_dims: `int` number that represents the dimension of the FC layers.
      activation: activation function. Supports 'relu' and 'swish'.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._num_classes = num_classes

    self._num_convs = num_convs
    self._num_filters = num_filters
    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf.keras.layers.Conv2D,
          kernel_initializer=tf.keras.initializers.VarianceScaling(
              scale=2, mode='fan_out', distribution='untruncated_normal'),
          bias_initializer=tf.zeros_initializer())

    self._num_fcs = num_fcs
    self._fc_dims = fc_dims
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation

    self._conv_ops = []
    self._conv_bn_ops = []
    for i in range(self._num_convs):
      self._conv_ops.append(
          self._conv2d_op(
              self._num_filters,
              kernel_size=(3, 3),
              strides=(1, 1),
              padding='same',
              dilation_rate=(1, 1),
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='conv_{}'.format(i)))
      if self._use_batch_norm:
        self._conv_bn_ops.append(self._norm_activation())

    self._fc_ops = []
    self._fc_bn_ops = []
    for i in range(self._num_fcs):
      self._fc_ops.append(
          tf.keras.layers.Dense(
              units=self._fc_dims,
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='fc{}'.format(i)))
      if self._use_batch_norm:
        self._fc_bn_ops.append(self._norm_activation(fused=False))

    self._class_predict = tf.keras.layers.Dense(
        self._num_classes,
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        bias_initializer=tf.zeros_initializer(),
        name='class-predict')
    self._box_predict = tf.keras.layers.Dense(
        self._num_classes * 4,
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.001),
        bias_initializer=tf.zeros_initializer(),
        name='box-predict')

  def __call__(self, roi_features, is_training=None):
    """Box and class branches for the Mask-RCNN model.

    Args:
      roi_features: A ROI feature tensor of shape [batch_size, num_rois,
        height_l, width_l, num_filters].
      is_training: `bool`, True if the model is in training mode.

    Returns:
      class_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes], representing the class predictions.
      box_outputs: a tensor with a shape of
        [batch_size, num_rois, num_classes * 4], representing the box
        predictions.
    """

    with keras_utils.maybe_enter_backend_graph(), tf.name_scope(
        'fast_rcnn_head'):
      # Reshape inputs before FC.
      _, num_rois, height, width, filters = roi_features.get_shape().as_list()

      net = tf.reshape(roi_features, [-1, height, width, filters])
      for i in range(self._num_convs):
        net = self._conv_ops[i](net)
        if self._use_batch_norm:
          net = self._conv_bn_ops[i](net, is_training=is_training)

      filters = self._num_filters if self._num_convs > 0 else filters
      net = tf.reshape(net, [-1, num_rois, height * width * filters])

      for i in range(self._num_fcs):
        net = self._fc_ops[i](net)
        if self._use_batch_norm:
          net = self._fc_bn_ops[i](net, is_training=is_training)

      class_outputs = self._class_predict(net)
      box_outputs = self._box_predict(net)
      return class_outputs, box_outputs
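
# Example usage of FastrcnnHead (an illustrative sketch, not part of the
# original API surface): the class count and crop shape are assumed values.
# `roi_features` is a [batch, num_rois, 7, 7, 256] tensor, e.g. produced by
# multilevel ROI align over FPN features.
#
#   box_head = FastrcnnHead(num_classes=91)
#   class_outputs, box_outputs = box_head(roi_features, is_training=True)
#   # class_outputs: [batch, num_rois, 91]; box_outputs: [batch, num_rois, 4 * 91].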


class MaskrcnnHead(tf.keras.layers.Layer):
  """Mask R-CNN head."""

  def __init__(
      self,
      num_classes,
      mask_target_size,
      num_convs=4,
      num_filters=256,
      use_separable_conv=False,
      activation='relu',
      use_batch_norm=True,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Initialize params to build Mask R-CNN mask head.

    Args:
      num_classes: an integer for the number of classes.
      mask_target_size: an integer that is the resolution of masks.
      num_convs: `int` number that represents the number of the intermediate
        conv layers before the prediction.
      num_filters: `int` number that represents the number of filters of the
        intermediate conv layers.
      use_separable_conv: `bool`, indicating whether the separable conv layers
        are used.
      activation: activation function. Supports 'relu' and 'swish'.
      use_batch_norm: `bool`, indicating whether batchnorm layers are added.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._num_classes = num_classes
    self._mask_target_size = mask_target_size

    self._num_convs = num_convs
    self._num_filters = num_filters
    if use_separable_conv:
      self._conv2d_op = functools.partial(
          tf.keras.layers.SeparableConv2D,
          depth_multiplier=1,
          bias_initializer=tf.zeros_initializer())
    else:
      self._conv2d_op = functools.partial(
          tf.keras.layers.Conv2D,
          kernel_initializer=tf.keras.initializers.VarianceScaling(
              scale=2, mode='fan_out', distribution='untruncated_normal'),
          bias_initializer=tf.zeros_initializer())
    if activation == 'relu':
      self._activation_op = tf.nn.relu
    elif activation == 'swish':
      self._activation_op = tf.nn.swish
    else:
      raise ValueError('Unsupported activation `{}`.'.format(activation))
    self._use_batch_norm = use_batch_norm
    self._norm_activation = norm_activation
    self._conv2d_ops = []
    for i in range(self._num_convs):
      self._conv2d_ops.append(
          self._conv2d_op(
              self._num_filters,
              kernel_size=(3, 3),
              strides=(1, 1),
              padding='same',
              dilation_rate=(1, 1),
              activation=(None
                          if self._use_batch_norm else self._activation_op),
              name='mask-conv-l%d' % i))
    self._mask_conv_transpose = tf.keras.layers.Conv2DTranspose(
        self._num_filters,
        kernel_size=(2, 2),
        strides=(2, 2),
        padding='valid',
        activation=(None if self._use_batch_norm else self._activation_op),
        kernel_initializer=tf.keras.initializers.VarianceScaling(
            scale=2, mode='fan_out', distribution='untruncated_normal'),
        bias_initializer=tf.zeros_initializer(),
        name='conv5-mask')

  def __call__(self, roi_features, class_indices, is_training=None):
    """Mask branch for the Mask-RCNN model.

    Args:
      roi_features: A ROI feature tensor of shape [batch_size, num_rois,
        height_l, width_l, num_filters].
      class_indices: a Tensor of shape [batch_size, num_rois], indicating which
        class the ROI is.
      is_training: `bool`, True if the model is in training mode.

    Returns:
      mask_outputs: a tensor with a shape of
        [batch_size, num_masks, mask_height, mask_width], representing the mask
        predictions gathered at the per-ROI class indices.
    """

    with keras_utils.maybe_enter_backend_graph():
      with tf.name_scope('mask_head'):
        _, num_rois, height, width, filters = roi_features.get_shape().as_list()
        net = tf.reshape(roi_features, [-1, height, width, filters])

        for i in range(self._num_convs):
          net = self._conv2d_ops[i](net)
          if self._use_batch_norm:
            net = self._norm_activation()(net, is_training=is_training)

        net = self._mask_conv_transpose(net)
        if self._use_batch_norm:
          net = self._norm_activation()(net, is_training=is_training)

        mask_outputs = self._conv2d_op(
            self._num_classes,
            kernel_size=(1, 1),
            strides=(1, 1),
            padding='valid',
            name='mask_fcn_logits')(
                net)
        mask_outputs = tf.reshape(mask_outputs, [
            -1, num_rois, self._mask_target_size, self._mask_target_size,
            self._num_classes
        ])

        with tf.name_scope('masks_post_processing'):
          # TODO(pengchong): Figure out the way not to use the static inferred
          # batch size.
          batch_size, num_masks = class_indices.get_shape().as_list()
          mask_outputs = tf.transpose(a=mask_outputs, perm=[0, 1, 4, 2, 3])
          # Constructs indices for gather.
          batch_indices = tf.tile(
              tf.expand_dims(tf.range(batch_size), axis=1), [1, num_masks])
          mask_indices = tf.tile(
              tf.expand_dims(tf.range(num_masks), axis=0), [batch_size, 1])
          gather_indices = tf.stack(
              [batch_indices, mask_indices, class_indices], axis=2)
          mask_outputs = tf.gather_nd(mask_outputs, gather_indices)
      return mask_outputs
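
# Example usage of MaskrcnnHead (an illustrative sketch, not part of the
# original API surface): the class count and shapes are assumed values.
# `roi_features` is a [batch, num_rois, 14, 14, 256] crop and `class_indices`
# is an int32 [batch, num_rois] tensor; the transposed conv doubles 14 -> 28.
#
#   mask_head = MaskrcnnHead(num_classes=91, mask_target_size=28)
#   mask_outputs = mask_head(roi_features, class_indices, is_training=True)
#   # mask_outputs: [batch, num_rois, 28, 28], one mask per ROI at its class.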


class RetinanetHead(object):
  """RetinaNet head."""

  def __init__(
      self,
      min_level,
      max_level,
      num_classes,
      anchors_per_location,
      num_convs=4,
      num_filters=256,
      use_separable_conv=False,
      norm_activation=nn_ops.norm_activation_builder(activation='relu')):
    """Initialize params to build RetinaNet head.

    Args:
      min_level: `int` number of minimum feature level.
      max_level: `int` number of maximum feature level.
      num_classes: `int` number of classification categories.
      anchors_per_location: `int` number of anchors per pixel location.
      num_convs: `int` number of stacked convolution before the last prediction
        layer.
      num_filters: `int` number of filters used in the head architecture.
      use_separable_conv: `bool` to indicate whether to use separable
        convolution.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._min_level = min_level
    self._max_level = max_level

    self._num_classes = num_classes
    self._anchors_per_location = anchors_per_location

    self._num_convs = num_convs
    self._num_filters = num_filters
    self._use_separable_conv = use_separable_conv
    with tf.name_scope('class_net') as scope_name:
      self._class_name_scope = tf.name_scope(scope_name)
    with tf.name_scope('box_net') as scope_name:
      self._box_name_scope = tf.name_scope(scope_name)
    self._build_class_net_layers(norm_activation)
    self._build_box_net_layers(norm_activation)

  def _class_net_batch_norm_name(self, i, level):
    return 'class-%d-%d' % (i, level)

  def _box_net_batch_norm_name(self, i, level):
    return 'box-%d-%d' % (i, level)

  def _build_class_net_layers(self, norm_activation):
    """Build re-usable layers for class prediction network."""
    if self._use_separable_conv:
      self._class_predict = tf.keras.layers.SeparableConv2D(
          self._num_classes * self._anchors_per_location,
          kernel_size=(3, 3),
          bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
          padding='same',
          name='class-predict')
    else:
      self._class_predict = tf.keras.layers.Conv2D(
          self._num_classes * self._anchors_per_location,
          kernel_size=(3, 3),
          bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1e-5),
          padding='same',
          name='class-predict')
    self._class_conv = []
    self._class_norm_activation = {}
    for i in range(self._num_convs):
      if self._use_separable_conv:
        self._class_conv.append(
            tf.keras.layers.SeparableConv2D(
                self._num_filters,
                kernel_size=(3, 3),
                bias_initializer=tf.zeros_initializer(),
                activation=None,
                padding='same',
                name='class-' + str(i)))
      else:
        self._class_conv.append(
            tf.keras.layers.Conv2D(
                self._num_filters,
                kernel_size=(3, 3),
                bias_initializer=tf.zeros_initializer(),
                kernel_initializer=tf.keras.initializers.RandomNormal(
                    stddev=0.01),
                activation=None,
                padding='same',
                name='class-' + str(i)))
      for level in range(self._min_level, self._max_level + 1):
        name = self._class_net_batch_norm_name(i, level)
        self._class_norm_activation[name] = norm_activation(name=name)

  def _build_box_net_layers(self, norm_activation):
    """Build re-usable layers for box prediction network."""
    if self._use_separable_conv:
      self._box_predict = tf.keras.layers.SeparableConv2D(
          4 * self._anchors_per_location,
          kernel_size=(3, 3),
          bias_initializer=tf.zeros_initializer(),
          padding='same',
          name='box-predict')
    else:
      self._box_predict = tf.keras.layers.Conv2D(
          4 * self._anchors_per_location,
          kernel_size=(3, 3),
          bias_initializer=tf.zeros_initializer(),
          kernel_initializer=tf.keras.initializers.RandomNormal(stddev=1e-5),
          padding='same',
          name='box-predict')
    self._box_conv = []
    self._box_norm_activation = {}
    for i in range(self._num_convs):
      if self._use_separable_conv:
        self._box_conv.append(
            tf.keras.layers.SeparableConv2D(
                self._num_filters,
                kernel_size=(3, 3),
                activation=None,
                bias_initializer=tf.zeros_initializer(),
                padding='same',
                name='box-' + str(i)))
      else:
        self._box_conv.append(
            tf.keras.layers.Conv2D(
                self._num_filters,
                kernel_size=(3, 3),
                activation=None,
                bias_initializer=tf.zeros_initializer(),
                kernel_initializer=tf.keras.initializers.RandomNormal(
                    stddev=0.01),
                padding='same',
                name='box-' + str(i)))
      for level in range(self._min_level, self._max_level + 1):
        name = self._box_net_batch_norm_name(i, level)
        self._box_norm_activation[name] = norm_activation(name=name)

  def __call__(self, fpn_features, is_training=None):
    """Returns outputs of RetinaNet head."""
    class_outputs = {}
    box_outputs = {}
    with keras_utils.maybe_enter_backend_graph(), tf.name_scope(
        'retinanet_head'):
      for level in range(self._min_level, self._max_level + 1):
        features = fpn_features[level]

        class_outputs[level] = self.class_net(
            features, level, is_training=is_training)
        box_outputs[level] = self.box_net(
            features, level, is_training=is_training)
    return class_outputs, box_outputs

  def class_net(self, features, level, is_training):
    """Class prediction network for RetinaNet."""
    with self._class_name_scope:
      for i in range(self._num_convs):
        features = self._class_conv[i](features)
        # The convolution layers in the class net are shared among all levels,
        # but each level has its own batch normalization to capture the
        # statistical difference among different levels.
        name = self._class_net_batch_norm_name(i, level)
        features = self._class_norm_activation[name](
            features, is_training=is_training)

      classes = self._class_predict(features)
    return classes

  def box_net(self, features, level, is_training=None):
    """Box regression network for RetinaNet."""
    with self._box_name_scope:
      for i in range(self._num_convs):
        features = self._box_conv[i](features)
        # The convolution layers in the box net are shared among all levels,
        # but each level has its own batch normalization to capture the
        # statistical difference among different levels.
        name = self._box_net_batch_norm_name(i, level)
        features = self._box_norm_activation[name](
            features, is_training=is_training)

      boxes = self._box_predict(features)
    return boxes
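
# Example usage of RetinanetHead (an illustrative sketch, not part of the
# original API surface): the level range, class count, and anchor count are
# assumed values. `fpn_features` maps level -> [batch, h_l, w_l, 256] tensors.
#
#   head = RetinanetHead(
#       min_level=3, max_level=7, num_classes=90, anchors_per_location=9)
#   class_outputs, box_outputs = head(fpn_features, is_training=True)
#   # class_outputs[l]: [batch, h_l, w_l, 90 * 9];
#   # box_outputs[l]: [batch, h_l, w_l, 4 * 9].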


# TODO(yeqing): Refactor this class when it is ready for var_scope reuse.
class ShapemaskPriorHead(object):
  """ShapeMask Prior head."""

  def __init__(self, num_classes, num_downsample_channels, mask_crop_size,
               use_category_for_mask, shape_prior_path):
    """Initialize params to build ShapeMask prior head.

    Args:
      num_classes: Number of output classes.
      num_downsample_channels: number of channels in mask branch.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in mask branch.
      shape_prior_path: the path to load shape priors.
    """
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._shape_prior_path = shape_prior_path
    self._use_category_for_mask = use_category_for_mask

    self._shape_prior_fc = tf.keras.layers.Dense(
        self._num_downsample_channels, name='shape-prior-fc')

  def __call__(self, fpn_features, boxes, outer_boxes, classes, is_training):
    """Generate the detection priors from the box detections and FPN features.

    This corresponds to the Fig. 4 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      fpn_features: a dictionary of FPN features.
      boxes: a float tensor of shape [batch_size, num_instances, 4] representing
        the tight gt boxes from dataloader/detection.
      outer_boxes: a float tensor of shape [batch_size, num_instances, 4]
        representing the loose gt boxes from dataloader/detection.
      classes: an int Tensor of shape [batch_size, num_instances] of instance
        classes.
      is_training: training mode or not.

    Returns:
      instance_features: a float Tensor of shape [batch_size * num_instances,
          mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
          instance feature crop.
      detection_priors: A float Tensor of shape [batch_size * num_instances,
        mask_size, mask_size, 1].
    """
    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('prior_mask'):
      batch_size, num_instances, _ = boxes.get_shape().as_list()
      outer_boxes = tf.cast(outer_boxes, tf.float32)
      boxes = tf.cast(boxes, tf.float32)
      instance_features = spatial_transform_ops.multilevel_crop_and_resize(
          fpn_features, outer_boxes, output_size=self._mask_crop_size)
      instance_features = self._shape_prior_fc(instance_features)

      shape_priors = self._get_priors()

      # Get uniform priors for each outer box.
      uniform_priors = tf.ones([
          batch_size, num_instances, self._mask_crop_size, self._mask_crop_size
      ])
      uniform_priors = spatial_transform_ops.crop_mask_in_target_box(
          uniform_priors, boxes, outer_boxes, self._mask_crop_size)

      # Classify shape priors using uniform priors + instance features.
      prior_distribution = self._classify_shape_priors(
          tf.cast(instance_features, tf.float32), uniform_priors, classes)

      instance_priors = tf.gather(shape_priors, classes)
      instance_priors *= tf.expand_dims(
          tf.expand_dims(tf.cast(prior_distribution, tf.float32), axis=-1),
          axis=-1)
      instance_priors = tf.reduce_sum(instance_priors, axis=2)
      detection_priors = spatial_transform_ops.crop_mask_in_target_box(
          instance_priors, boxes, outer_boxes, self._mask_crop_size)

      return instance_features, detection_priors

  def _get_priors(self):
    """Load shape priors from file."""
    # Loads class-specific or class-agnostic shape priors.
    if self._shape_prior_path:
      # Priors are loaded into shape [mask_num_classes, num_clusters, 32, 32].
      priors = np.load(tf.io.gfile.GFile(self._shape_prior_path, 'rb'))
      priors = tf.convert_to_tensor(priors, dtype=tf.float32)
      self._num_clusters = priors.get_shape().as_list()[1]
    else:
      # If the prior path does not exist, do not use priors, i.e., priors are
      # equal to a uniform empty 32x32 patch.
      self._num_clusters = 1
      priors = tf.zeros([
          self._mask_num_classes, self._num_clusters, self._mask_crop_size,
          self._mask_crop_size
      ])
    return priors

  def _classify_shape_priors(self, features, uniform_priors, classes):
    """Classify the uniform prior by predicting the shape modes.

    Classify the object crop features into K modes of the clusters for each
    category.

    Args:
      features: A float Tensor of shape [batch_size, num_instances, mask_size,
        mask_size, num_channels].
      uniform_priors: A float Tensor of shape [batch_size, num_instances,
        mask_size, mask_size] representing the uniform detection priors.
      classes: An int Tensor of shape [batch_size, num_instances] of detection
        class ids.

    Returns:
      prior_distribution: A float Tensor of shape
        [batch_size, num_instances, num_clusters] representing the classifier
        output probability over all possible shapes.
    """

    batch_size, num_instances, _, _, _ = features.get_shape().as_list()
    features *= tf.expand_dims(uniform_priors, axis=-1)
    # Reduce spatial dimension of features. The features have shape
    # [batch_size, num_instances, num_channels].
    features = tf.reduce_mean(features, axis=(2, 3))
    logits = tf.keras.layers.Dense(
        self._mask_num_classes * self._num_clusters,
        kernel_initializer=tf.random_normal_initializer(stddev=0.01))(
            features)
    logits = tf.reshape(
        logits,
        [batch_size, num_instances, self._mask_num_classes, self._num_clusters])
    if self._use_category_for_mask:
      logits = tf.gather(logits, tf.expand_dims(classes, axis=-1), batch_dims=2)
      logits = tf.squeeze(logits, axis=2)
    else:
      logits = logits[:, :, 0, :]

    distribution = tf.nn.softmax(logits, name='shape_prior_weights')
    return distribution


class ShapemaskCoarsemaskHead(object):
  """ShapemaskCoarsemaskHead head."""

  def __init__(self,
               num_classes,
               num_downsample_channels,
               mask_crop_size,
               use_category_for_mask,
               num_convs,
               norm_activation=nn_ops.norm_activation_builder()):
    """Initialize params to build ShapeMask coarse and fine prediction head.

    Args:
      num_classes: `int` number of mask classification categories.
      num_downsample_channels: `int` number of filters at mask head.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in mask branch.
      num_convs: `int` number of stacked convolution before the last prediction
        layer.
      norm_activation: an operation that includes a normalization layer
        followed by an optional activation layer.
    """
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._use_category_for_mask = use_category_for_mask
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._num_convs = num_convs
    self._norm_activation = norm_activation

    self._coarse_mask_fc = tf.keras.layers.Dense(
        self._num_downsample_channels, name='coarse-mask-fc')

    self._class_conv = []
    self._class_norm_activation = []

    for i in range(self._num_convs):
      self._class_conv.append(
          tf.keras.layers.Conv2D(
              self._num_downsample_channels,
              kernel_size=(3, 3),
              bias_initializer=tf.zeros_initializer(),
              kernel_initializer=tf.keras.initializers.RandomNormal(
                  stddev=0.01),
              padding='same',
              name='coarse-mask-class-%d' % i))

      self._class_norm_activation.append(
          norm_activation(name='coarse-mask-class-%d-bn' % i))

    self._class_predict = tf.keras.layers.Conv2D(
        self._mask_num_classes,
        kernel_size=(1, 1),
        # Focal loss bias initialization to have foreground 0.01 probability.
        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        padding='same',
        name='coarse-mask-class-predict')

  def __call__(self, features, detection_priors, classes, is_training):
    """Generate instance masks from FPN features and detection priors.

    This corresponds to the Fig. 5-6 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      features: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
      detection_priors: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, 1]. This is the detection prior for the
        instance.
      classes: an int Tensor of shape [batch_size, num_instances] of instance
        classes.
      is_training: a bool indicating whether in training mode.

    Returns:
      mask_outputs: instance mask prediction as a float Tensor of shape
        [batch_size, num_instances, mask_size, mask_size].
    """
    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('coarse_mask'):
      # Transform detection priors to have the same dimension as features.
      detection_priors = tf.expand_dims(detection_priors, axis=-1)
      detection_priors = self._coarse_mask_fc(detection_priors)

      features += detection_priors
      mask_logits = self.decoder_net(features, is_training)
      # Gather the logits with right input class.
      if self._use_category_for_mask:
        mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
        mask_logits = tf.gather(
            mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
        mask_logits = tf.squeeze(mask_logits, axis=2)
      else:
        mask_logits = mask_logits[..., 0]

      return mask_logits

  def decoder_net(self, features, is_training=False):
    """Coarse mask decoder network architecture.

    Args:
      features: A tensor of size [batch, height_in, width_in, channels_in].
      is_training: Whether batch_norm layers are in training mode.

    Returns:
      images: A feature tensor of size [batch, output_size, output_size,
        num_channels]
    """
    (batch_size, num_instances, height, width,
     num_channels) = features.get_shape().as_list()
    features = tf.reshape(
        features, [batch_size * num_instances, height, width, num_channels])
    for i in range(self._num_convs):
      features = self._class_conv[i](features)
      features = self._class_norm_activation[i](
          features, is_training=is_training)

    mask_logits = self._class_predict(features)
    mask_logits = tf.reshape(
        mask_logits,
        [batch_size, num_instances, height, width, self._mask_num_classes])
    return mask_logits


class ShapemaskFinemaskHead(object):
  """ShapemaskFinemaskHead head."""

  def __init__(self,
               num_classes,
               num_downsample_channels,
               mask_crop_size,
               use_category_for_mask,
               num_convs,
               upsample_factor,
               norm_activation=nn_ops.norm_activation_builder()):
    """Initialize params to build ShapeMask coarse and fine prediction head.

    Args:
      num_classes: `int` number of mask classification categories.
      num_downsample_channels: `int` number of filters at mask head.
      mask_crop_size: feature crop size.
      use_category_for_mask: use class information in mask branch.
      num_convs: `int` number of stacked convolution before the last prediction
        layer.
      upsample_factor: `int` number of fine mask upsampling factor.
      norm_activation: an operation that includes a batch normalization layer
        followed by a ReLU layer (optional).
    """
    self._use_category_for_mask = use_category_for_mask
    self._mask_num_classes = num_classes if use_category_for_mask else 1
    self._num_downsample_channels = num_downsample_channels
    self._mask_crop_size = mask_crop_size
    self._num_convs = num_convs
    self.up_sample_factor = upsample_factor

    self._fine_mask_fc = tf.keras.layers.Dense(
        self._num_downsample_channels, name='fine-mask-fc')

    self._upsample_conv = tf.keras.layers.Conv2DTranspose(
        self._num_downsample_channels,
        (self.up_sample_factor, self.up_sample_factor),
        (self.up_sample_factor, self.up_sample_factor),
        name='fine-mask-conv2d-tran')

    self._fine_class_conv = []
    self._fine_class_bn = []
    for i in range(self._num_convs):
      self._fine_class_conv.append(
          tf.keras.layers.Conv2D(
              self._num_downsample_channels,
              kernel_size=(3, 3),
              bias_initializer=tf.zeros_initializer(),
              kernel_initializer=tf.keras.initializers.RandomNormal(
                  stddev=0.01),
              activation=None,
              padding='same',
              name='fine-mask-class-%d' % i))
      self._fine_class_bn.append(
          norm_activation(name='fine-mask-class-%d-bn' % i))

    self._class_predict_conv = tf.keras.layers.Conv2D(
        self._mask_num_classes,
        kernel_size=(1, 1),
        # Focal loss bias initialization to have foreground 0.01 probability.
        bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
        kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01),
        padding='same',
        name='fine-mask-class-predict')

  def __call__(self, features, mask_logits, classes, is_training):
    """Generate instance masks from FPN features and detection priors.

    This corresponds to the Fig. 5-6 of the ShapeMask paper at
    https://arxiv.org/pdf/1904.03239.pdf

    Args:
      features: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
        instance feature crop.
      mask_logits: a float Tensor of shape [batch_size, num_instances,
        mask_crop_size, mask_crop_size] indicating predicted mask logits.
      classes: an int Tensor of shape [batch_size, num_instances] of instance
        classes.
      is_training: a bool indicating whether in training mode.

    Returns:
      mask_outputs: instance mask prediction as a float Tensor of shape
        [batch_size, num_instances, mask_size, mask_size].
    """
    # Extract the foreground mean features
    # with tf.variable_scope('fine_mask', reuse=tf.AUTO_REUSE):
    with keras_utils.maybe_enter_backend_graph(), tf.name_scope('fine_mask'):
      mask_probs = tf.nn.sigmoid(mask_logits)
      # Compute instance embedding for hard average.
      binary_mask = tf.cast(tf.greater(mask_probs, 0.5), features.dtype)
      instance_embedding = tf.reduce_sum(
          features * tf.expand_dims(binary_mask, axis=-1), axis=(2, 3))
      instance_embedding /= tf.expand_dims(
          tf.reduce_sum(binary_mask, axis=(2, 3)) + 1e-20, axis=-1)
      # Take the difference between crop features and mean instance features.
      features -= tf.expand_dims(
          tf.expand_dims(instance_embedding, axis=2), axis=2)

      features += self._fine_mask_fc(tf.expand_dims(mask_probs, axis=-1))

      # Decoder to generate upsampled segmentation mask.
      mask_logits = self.decoder_net(features, is_training)
      if self._use_category_for_mask:
        mask_logits = tf.transpose(mask_logits, [0, 1, 4, 2, 3])
        mask_logits = tf.gather(
            mask_logits, tf.expand_dims(classes, -1), batch_dims=2)
        mask_logits = tf.squeeze(mask_logits, axis=2)
      else:
        mask_logits = mask_logits[..., 0]

    return mask_logits

  def decoder_net(self, features, is_training=False):
    """Fine mask decoder network architecture.

    Args:
      features: A tensor of size [batch, height_in, width_in, channels_in].
      is_training: Whether batch_norm layers are in training mode.

    Returns:
      images: A feature tensor of size [batch, output_size, output_size,
        num_channels], where output size is self.up_sample_factor times
        that of input.
    """
    (batch_size, num_instances, height, width,
     num_channels) = features.get_shape().as_list()
    features = tf.reshape(
        features, [batch_size * num_instances, height, width, num_channels])
    for i in range(self._num_convs):
      features = self._fine_class_conv[i](features)
      features = self._fine_class_bn[i](features, is_training=is_training)

    if self.up_sample_factor > 1:
      features = self._upsample_conv(features)

    # Predict per-class instance masks.
    mask_logits = self._class_predict_conv(features)

    mask_logits = tf.reshape(mask_logits, [
        batch_size, num_instances, height * self.up_sample_factor,
        width * self.up_sample_factor, self._mask_num_classes
    ])
    return mask_logits
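

# Example of chaining the three ShapeMask heads above (an illustrative sketch,
# not part of the original API surface): the class count, channel count, crop
# size, conv counts, and prior path are assumed values. `fpn_features` is a
# dict of FPN feature maps; `boxes`, `outer_boxes` ([batch, num_instances, 4])
# and `classes` ([batch, num_instances]) come from the detection branch or the
# data loader.
#
#   prior_head = ShapemaskPriorHead(
#       num_classes=91, num_downsample_channels=128, mask_crop_size=32,
#       use_category_for_mask=True, shape_prior_path='/path/to/priors.npy')
#   coarse_head = ShapemaskCoarsemaskHead(
#       num_classes=91, num_downsample_channels=128, mask_crop_size=32,
#       use_category_for_mask=True, num_convs=2)
#   fine_head = ShapemaskFinemaskHead(
#       num_classes=91, num_downsample_channels=128, mask_crop_size=32,
#       use_category_for_mask=True, num_convs=2, upsample_factor=2)
#
#   instance_features, detection_priors = prior_head(
#       fpn_features, boxes, outer_boxes, classes, is_training=True)
#   coarse_logits = coarse_head(
#       instance_features, detection_priors, classes, is_training=True)
#   fine_logits = fine_head(
#       instance_features, coarse_logits, classes, is_training=True)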