"vscode:/vscode.git/clone" did not exist on "bfb35d0929c6e0b678a1a11a27357ca5c2cebc53"
Commit 472e2f80 authored by zhanggzh

Merge remote-tracking branch 'tf_model/main'

parents d91296eb f3a14f85
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model definition for the RetinaNet Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.dataloader import mode_keys
from official.legacy.detection.evaluation import factory as eval_factory
from official.legacy.detection.modeling import base_model
from official.legacy.detection.modeling import losses
from official.legacy.detection.modeling.architecture import factory
from official.legacy.detection.ops import postprocess_ops
class RetinanetModel(base_model.Model):
"""RetinaNet model function."""
def __init__(self, params):
super(RetinanetModel, self).__init__(params)
# For eval metrics.
self._params = params
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._head_fn = factory.retinanet_head_generator(params)
# Loss function.
self._cls_loss_fn = losses.RetinanetClassLoss(
params.retinanet_loss, params.architecture.num_classes)
self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
self._box_loss_weight = params.retinanet_loss.box_loss_weight
self._keras_model = None
# Predict function.
self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
params.architecture.min_level, params.architecture.max_level,
params.postprocess)
self._transpose_input = params.train.transpose_input
assert not self._transpose_input, 'Transpose input is not supported.'
# Input layer.
self._input_layer = tf.keras.layers.Input(
shape=(None, None, params.retinanet_parser.num_channels),
name='',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32)
def build_outputs(self, inputs, mode):
# If the input image is transposed (from NHWC to HWCN), we need to revert it
# back to the original shape before it's used in the computation.
if self._transpose_input:
inputs = tf.transpose(inputs, [3, 0, 1, 2])
backbone_features = self._backbone_fn(
inputs, is_training=(mode == mode_keys.TRAIN))
fpn_features = self._fpn_fn(
backbone_features, is_training=(mode == mode_keys.TRAIN))
cls_outputs, box_outputs = self._head_fn(
fpn_features, is_training=(mode == mode_keys.TRAIN))
if self._use_bfloat16:
levels = cls_outputs.keys()
for level in levels:
cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
model_outputs = {
'cls_outputs': cls_outputs,
'box_outputs': box_outputs,
}
return model_outputs
def build_loss_fn(self):
if self._keras_model is None:
raise ValueError('build_loss_fn() must be called after build_model().')
filter_fn = self.make_filter_trainable_variables_fn()
trainable_variables = filter_fn(self._keras_model.trainable_variables)
def _total_loss_fn(labels, outputs):
cls_loss = self._cls_loss_fn(outputs['cls_outputs'],
labels['cls_targets'],
labels['num_positives'])
box_loss = self._box_loss_fn(outputs['box_outputs'],
labels['box_targets'],
labels['num_positives'])
model_loss = cls_loss + self._box_loss_weight * box_loss
l2_regularization_loss = self.weight_decay_loss(trainable_variables)
total_loss = model_loss + l2_regularization_loss
return {
'total_loss': total_loss,
'cls_loss': cls_loss,
'box_loss': box_loss,
'model_loss': model_loss,
'l2_regularization_loss': l2_regularization_loss,
}
return _total_loss_fn
def build_model(self, params, mode=None):
if self._keras_model is None:
outputs = self.model_outputs(self._input_layer, mode)
model = tf.keras.models.Model(
inputs=self._input_layer, outputs=outputs, name='retinanet')
assert model is not None, 'Failed to build tf.keras.Model.'
model.optimizer = self.build_optimizer()
self._keras_model = model
return self._keras_model
def post_processing(self, labels, outputs):
# TODO(yeqing): Moves the output related part into build_outputs.
required_output_fields = ['cls_outputs', 'box_outputs']
for field in required_output_fields:
if field not in outputs:
raise ValueError('"%s" is missing in outputs, required %s found %s' %
(field, required_output_fields, outputs.keys()))
required_label_fields = ['image_info', 'groundtruths']
for field in required_label_fields:
if field not in labels:
raise ValueError('"%s" is missing in labels, required %s found %s' %
(field, required_label_fields, labels.keys()))
boxes, scores, classes, valid_detections = self._generate_detections_fn(
outputs['box_outputs'], outputs['cls_outputs'], labels['anchor_boxes'],
labels['image_info'][:, 1:2, :])
# Discards the old output tensors to save memory. The `cls_outputs` and
# `box_outputs` are pretty big and could potentially lead to memory issues.
outputs = {
'source_id': labels['groundtruths']['source_id'],
'image_info': labels['image_info'],
'num_detections': valid_detections,
'detection_boxes': boxes,
'detection_classes': classes,
'detection_scores': scores,
}
if 'groundtruths' in labels:
labels['source_id'] = labels['groundtruths']['source_id']
labels['boxes'] = labels['groundtruths']['boxes']
labels['classes'] = labels['groundtruths']['classes']
labels['areas'] = labels['groundtruths']['areas']
labels['is_crowds'] = labels['groundtruths']['is_crowds']
return labels, outputs
def eval_metrics(self):
return eval_factory.evaluator_generator(self._params.eval)
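# --- Illustrative usage sketch (not part of the original file) ---
# A minimal sketch of the intended call order, assuming `params` is the
# detection config object whose fields are read above (architecture,
# retinanet_loss, retinanet_parser, train, postprocess, eval). It documents the
# dependency asserted in build_loss_fn(): build_model() must run first.
def _example_retinanet_training_setup(params):
  model = RetinanetModel(params)
  keras_model = model.build_model(params, mode=mode_keys.TRAIN)
  loss_fn = model.build_loss_fn()  # Valid only after build_model().
  # loss_fn(labels, outputs) returns a dict with 'total_loss', 'cls_loss', etc.
  return keras_model, loss_fn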
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model definition for the ShapeMask Model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.dataloader import anchor
from official.legacy.detection.dataloader import mode_keys
from official.legacy.detection.evaluation import factory as eval_factory
from official.legacy.detection.modeling import base_model
from official.legacy.detection.modeling import losses
from official.legacy.detection.modeling.architecture import factory
from official.legacy.detection.ops import postprocess_ops
from official.legacy.detection.utils import box_utils
class ShapeMaskModel(base_model.Model):
"""ShapeMask model function."""
def __init__(self, params):
super(ShapeMaskModel, self).__init__(params)
self._params = params
self._keras_model = None
# Architecture generators.
self._backbone_fn = factory.backbone_generator(params)
self._fpn_fn = factory.multilevel_features_generator(params)
self._retinanet_head_fn = factory.retinanet_head_generator(params)
self._shape_prior_head_fn = factory.shapeprior_head_generator(params)
self._coarse_mask_fn = factory.coarsemask_head_generator(params)
self._fine_mask_fn = factory.finemask_head_generator(params)
# Loss functions.
self._cls_loss_fn = losses.RetinanetClassLoss(
params.retinanet_loss, params.architecture.num_classes)
self._box_loss_fn = losses.RetinanetBoxLoss(params.retinanet_loss)
self._box_loss_weight = params.retinanet_loss.box_loss_weight
# Mask loss function.
self._shapemask_prior_loss_fn = losses.ShapemaskMseLoss()
self._shapemask_loss_fn = losses.ShapemaskLoss()
self._shape_prior_loss_weight = (
params.shapemask_loss.shape_prior_loss_weight)
self._coarse_mask_loss_weight = (
params.shapemask_loss.coarse_mask_loss_weight)
self._fine_mask_loss_weight = (params.shapemask_loss.fine_mask_loss_weight)
# Predict function.
self._generate_detections_fn = postprocess_ops.MultilevelDetectionGenerator(
params.architecture.min_level, params.architecture.max_level,
params.postprocess)
def build_outputs(self, inputs, mode):
is_training = mode == mode_keys.TRAIN
images = inputs['image']
if 'anchor_boxes' in inputs:
anchor_boxes = inputs['anchor_boxes']
else:
anchor_boxes = anchor.Anchor(
self._params.architecture.min_level,
self._params.architecture.max_level, self._params.anchor.num_scales,
self._params.anchor.aspect_ratios, self._params.anchor.anchor_size,
images.get_shape().as_list()[1:3]).multilevel_boxes
batch_size = tf.shape(images)[0]
for level in anchor_boxes:
anchor_boxes[level] = tf.tile(
tf.expand_dims(anchor_boxes[level], 0), [batch_size, 1, 1, 1])
backbone_features = self._backbone_fn(images, is_training=is_training)
fpn_features = self._fpn_fn(backbone_features, is_training=is_training)
cls_outputs, box_outputs = self._retinanet_head_fn(
fpn_features, is_training=is_training)
valid_boxes, valid_scores, valid_classes, valid_detections = (
self._generate_detections_fn(box_outputs, cls_outputs, anchor_boxes,
inputs['image_info'][:, 1:2, :]))
image_size = images.get_shape().as_list()[1:3]
valid_outer_boxes = box_utils.compute_outer_boxes(
tf.reshape(valid_boxes, [-1, 4]),
image_size,
scale=self._params.shapemask_parser.outer_box_scale)
valid_outer_boxes = tf.reshape(valid_outer_boxes, tf.shape(valid_boxes))
# Wrapping if else code paths into a layer to make the checkpoint loadable
# in prediction mode.
class SampledBoxesLayer(tf.keras.layers.Layer):
"""Selects groundtruth boxes during training and detected boxes otherwise."""
def call(self, inputs, val_boxes, val_classes, val_outer_boxes, training):
if training:
boxes = inputs['mask_boxes']
outer_boxes = inputs['mask_outer_boxes']
classes = inputs['mask_classes']
else:
boxes = val_boxes
classes = val_classes
outer_boxes = val_outer_boxes
return boxes, classes, outer_boxes
boxes, classes, outer_boxes = SampledBoxesLayer()(
inputs,
valid_boxes,
valid_classes,
valid_outer_boxes,
training=is_training)
instance_features, prior_masks = self._shape_prior_head_fn(
fpn_features, boxes, outer_boxes, classes, is_training)
coarse_mask_logits = self._coarse_mask_fn(instance_features, prior_masks,
classes, is_training)
fine_mask_logits = self._fine_mask_fn(instance_features, coarse_mask_logits,
classes, is_training)
model_outputs = {
'cls_outputs': cls_outputs,
'box_outputs': box_outputs,
'fine_mask_logits': fine_mask_logits,
'coarse_mask_logits': coarse_mask_logits,
'prior_masks': prior_masks,
}
if not is_training:
model_outputs.update({
'num_detections': valid_detections,
'detection_boxes': valid_boxes,
'detection_outer_boxes': valid_outer_boxes,
'detection_masks': fine_mask_logits,
'detection_classes': valid_classes,
'detection_scores': valid_scores,
})
return model_outputs
def build_loss_fn(self):
if self._keras_model is None:
raise ValueError('build_loss_fn() must be called after build_model().')
filter_fn = self.make_filter_trainable_variables_fn()
trainable_variables = filter_fn(self._keras_model.trainable_variables)
def _total_loss_fn(labels, outputs):
cls_loss = self._cls_loss_fn(outputs['cls_outputs'],
labels['cls_targets'],
labels['num_positives'])
box_loss = self._box_loss_fn(outputs['box_outputs'],
labels['box_targets'],
labels['num_positives'])
# Adds Shapemask model losses.
shape_prior_loss = self._shapemask_prior_loss_fn(outputs['prior_masks'],
labels['mask_targets'],
labels['mask_is_valid'])
coarse_mask_loss = self._shapemask_loss_fn(outputs['coarse_mask_logits'],
labels['mask_targets'],
labels['mask_is_valid'])
fine_mask_loss = self._shapemask_loss_fn(outputs['fine_mask_logits'],
labels['fine_mask_targets'],
labels['mask_is_valid'])
model_loss = (
cls_loss + self._box_loss_weight * box_loss +
shape_prior_loss * self._shape_prior_loss_weight +
coarse_mask_loss * self._coarse_mask_loss_weight +
fine_mask_loss * self._fine_mask_loss_weight)
l2_regularization_loss = self.weight_decay_loss(trainable_variables)
total_loss = model_loss + l2_regularization_loss
shapemask_losses = {
'total_loss': total_loss,
'loss': total_loss,
'retinanet_cls_loss': cls_loss,
'l2_regularization_loss': l2_regularization_loss,
'retinanet_box_loss': box_loss,
'shapemask_prior_loss': shape_prior_loss,
'shapemask_coarse_mask_loss': coarse_mask_loss,
'shapemask_fine_mask_loss': fine_mask_loss,
'model_loss': model_loss,
}
return shapemask_losses
return _total_loss_fn
def build_input_layers(self, params, mode):
is_training = mode == mode_keys.TRAIN
input_shape = (
params.shapemask_parser.output_size +
[params.shapemask_parser.num_channels])
if is_training:
batch_size = params.train.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2], batch_size=batch_size, name='image_info'),
'mask_classes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks],
batch_size=batch_size,
name='mask_classes',
dtype=tf.int64),
'mask_outer_boxes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks, 4],
batch_size=batch_size,
name='mask_outer_boxes',
dtype=tf.float32),
'mask_boxes':
tf.keras.layers.Input(
shape=[params.shapemask_parser.num_sampled_masks, 4],
batch_size=batch_size,
name='mask_boxes',
dtype=tf.float32),
}
else:
batch_size = params.eval.batch_size
input_layer = {
'image':
tf.keras.layers.Input(
shape=input_shape,
batch_size=batch_size,
name='image',
dtype=tf.bfloat16 if self._use_bfloat16 else tf.float32),
'image_info':
tf.keras.layers.Input(
shape=[4, 2], batch_size=batch_size, name='image_info'),
}
return input_layer
def build_model(self, params, mode):
if self._keras_model is None:
input_layers = self.build_input_layers(self._params, mode)
outputs = self.model_outputs(input_layers, mode)
model = tf.keras.models.Model(
inputs=input_layers, outputs=outputs, name='shapemask')
assert model is not None, 'Failed to build tf.keras.Model.'
model.optimizer = self.build_optimizer()
self._keras_model = model
return self._keras_model
def post_processing(self, labels, outputs):
required_output_fields = [
'num_detections', 'detection_boxes', 'detection_classes',
'detection_masks', 'detection_scores'
]
for field in required_output_fields:
if field not in outputs:
raise ValueError(
'"{}" is missing in outputs, required {} found {}'.format(
field, required_output_fields, outputs.keys()))
required_label_fields = ['image_info']
for field in required_label_fields:
if field not in labels:
raise ValueError(
'"{}" is missing in labels, required {} found {}'.format(
field, required_label_fields, labels.keys()))
predictions = {
'image_info': labels['image_info'],
'num_detections': outputs['num_detections'],
'detection_boxes': outputs['detection_boxes'],
'detection_outer_boxes': outputs['detection_outer_boxes'],
'detection_classes': outputs['detection_classes'],
'detection_scores': outputs['detection_scores'],
'detection_masks': outputs['detection_masks'],
}
if 'groundtruths' in labels:
predictions['source_id'] = labels['groundtruths']['source_id']
labels = labels['groundtruths']
return labels, predictions
def eval_metrics(self):
return eval_factory.evaluator_generator(self._params.eval)
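# --- Illustrative usage sketch (not part of the original file) ---
# ShapeMask builds different Keras input layers for training and evaluation:
# training mode additionally feeds the sampled mask boxes/classes, while eval
# mode only feeds the image and image_info (see build_input_layers above).
# `params` is assumed to be the detection config object consumed by this class.
def _example_shapemask_input_signatures(params):
  model = ShapeMaskModel(params)
  train_inputs = model.build_input_layers(params, mode_keys.TRAIN)
  eval_inputs = model.build_input_layers(params, mode_keys.EVAL)
  # train_inputs has keys: image, image_info, mask_classes, mask_outer_boxes,
  # mask_boxes; eval_inputs only has: image, image_info.
  return sorted(train_inputs.keys()), sorted(eval_inputs.keys())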
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow implementation of non max suppression."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.utils import box_utils
NMS_TILE_SIZE = 512
def _self_suppression(iou, _, iou_sum):
batch_size = tf.shape(iou)[0]
can_suppress_others = tf.cast(
tf.reshape(tf.reduce_max(iou, 1) <= 0.5, [batch_size, -1, 1]), iou.dtype)
iou_suppressed = tf.reshape(
tf.cast(tf.reduce_max(can_suppress_others * iou, 1) <= 0.5, iou.dtype),
[batch_size, -1, 1]) * iou
iou_sum_new = tf.reduce_sum(iou_suppressed, [1, 2])
return [
iou_suppressed,
tf.reduce_any(iou_sum - iou_sum_new > 0.5), iou_sum_new
]
def _cross_suppression(boxes, box_slice, iou_threshold, inner_idx):
batch_size = tf.shape(boxes)[0]
new_slice = tf.slice(boxes, [0, inner_idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
iou = box_utils.bbox_overlap(new_slice, box_slice)
ret_slice = tf.expand_dims(
tf.cast(tf.reduce_all(iou < iou_threshold, [1]), box_slice.dtype),
2) * box_slice
return boxes, ret_slice, iou_threshold, inner_idx + 1
def _suppression_loop_body(boxes, iou_threshold, output_size, idx):
"""Process boxes in the range [idx*NMS_TILE_SIZE, (idx+1)*NMS_TILE_SIZE).
Args:
boxes: a tensor with a shape of [batch_size, anchors, 4].
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
output_size: an int32 tensor of size [batch_size]. Representing the number
of selected boxes for each batch.
idx: an integer scalar representing induction variable.
Returns:
boxes: updated boxes.
iou_threshold: pass down iou_threshold to the next iteration.
output_size: the updated output_size.
idx: the updated induction variable.
"""
boxes_shape = tf.shape(boxes)
num_tiles = boxes_shape[1] // NMS_TILE_SIZE
batch_size = boxes_shape[0]
# Iterates over tiles that can possibly suppress the current tile.
box_slice = tf.slice(boxes, [0, idx * NMS_TILE_SIZE, 0],
[batch_size, NMS_TILE_SIZE, 4])
_, box_slice, _, _ = tf.while_loop(
lambda _boxes, _box_slice, _threshold, inner_idx: inner_idx < idx,
_cross_suppression, [boxes, box_slice, iou_threshold,
tf.constant(0)])
# Iterates over the current tile to compute self-suppression.
iou = box_utils.bbox_overlap(box_slice, box_slice)
mask = tf.expand_dims(
tf.reshape(tf.range(NMS_TILE_SIZE), [1, -1]) > tf.reshape(
tf.range(NMS_TILE_SIZE), [-1, 1]), 0)
iou *= tf.cast(tf.logical_and(mask, iou >= iou_threshold), iou.dtype)
suppressed_iou, _, _ = tf.while_loop(
lambda _iou, loop_condition, _iou_sum: loop_condition, _self_suppression,
[iou, tf.constant(True),
tf.reduce_sum(iou, [1, 2])])
suppressed_box = tf.reduce_sum(suppressed_iou, 1) > 0
box_slice *= tf.expand_dims(1.0 - tf.cast(suppressed_box, box_slice.dtype), 2)
# Uses box_slice to update the input boxes.
mask = tf.reshape(
tf.cast(tf.equal(tf.range(num_tiles), idx), boxes.dtype), [1, -1, 1, 1])
boxes = tf.tile(tf.expand_dims(
box_slice, [1]), [1, num_tiles, 1, 1]) * mask + tf.reshape(
boxes, [batch_size, num_tiles, NMS_TILE_SIZE, 4]) * (1 - mask)
boxes = tf.reshape(boxes, boxes_shape)
# Updates output_size.
output_size += tf.reduce_sum(
tf.cast(tf.reduce_any(box_slice > 0, [2]), tf.int32), [1])
return boxes, iou_threshold, output_size, idx + 1
def sorted_non_max_suppression_padded(scores, boxes, max_output_size,
iou_threshold):
"""A wrapper that handles non-maximum suppression.
Assumption:
* The boxes are sorted by scores unless the box is a dot (all coordinates
are zero).
* Boxes with higher scores can be used to suppress boxes with lower scores.
The overall design of the algorithm is to handle boxes tile-by-tile:
boxes = boxes.pad_to_multiple_of(tile_size)
num_tiles = len(boxes) // tile_size
output_boxes = []
for i in range(num_tiles):
box_tile = boxes[i*tile_size : (i+1)*tile_size]
for j in range(i):
suppressing_tile = boxes[j*tile_size : (j+1)*tile_size]
iou = bbox_overlap(box_tile, suppressing_tile)
# if the box is suppressed in iou, clear it to a dot
box_tile *= _update_boxes(iou)
# Iteratively handle the diagonal tile.
iou = bbox_overlap(box_tile, box_tile)
iou_changed = True
while iou_changed:
# boxes that are not suppressed by anything else
suppressing_boxes = _get_suppressing_boxes(iou)
# boxes that are suppressed by suppressing_boxes
suppressed_boxes = _get_suppressed_boxes(iou, suppressing_boxes)
# clear iou to 0 for boxes that are suppressed, as they cannot be used
# to suppress other boxes any more
new_iou = _clear_iou(iou, suppressed_boxes)
iou_changed = (new_iou != iou)
iou = new_iou
# remaining boxes that can still suppress others, are selected boxes.
output_boxes.append(_get_suppressing_boxes(iou))
if len(output_boxes) >= max_output_size:
break
Args:
scores: a tensor with a shape of [batch_size, anchors].
boxes: a tensor with a shape of [batch_size, anchors, 4].
max_output_size: a scalar integer `Tensor` representing the maximum number
of boxes to be selected by non max suppression.
iou_threshold: a float representing the threshold for deciding whether boxes
overlap too much with respect to IOU.
Returns:
nms_scores: a tensor with a shape of [batch_size, anchors]. It has same
dtype as input scores.
nms_proposals: a tensor with a shape of [batch_size, anchors, 4]. It has
same dtype as input boxes.
"""
batch_size = tf.shape(boxes)[0]
num_boxes = tf.shape(boxes)[1]
pad = tf.cast(
tf.math.ceil(tf.cast(num_boxes, tf.float32) / NMS_TILE_SIZE),
tf.int32) * NMS_TILE_SIZE - num_boxes
boxes = tf.pad(tf.cast(boxes, tf.float32), [[0, 0], [0, pad], [0, 0]])
scores = tf.pad(
tf.cast(scores, tf.float32), [[0, 0], [0, pad]], constant_values=-1)
num_boxes += pad
def _loop_cond(unused_boxes, unused_threshold, output_size, idx):
return tf.logical_and(
tf.reduce_min(output_size) < max_output_size,
idx < num_boxes // NMS_TILE_SIZE)
selected_boxes, _, output_size, _ = tf.while_loop(
_loop_cond, _suppression_loop_body,
[boxes, iou_threshold,
tf.zeros([batch_size], tf.int32),
tf.constant(0)])
idx = num_boxes - tf.cast(
tf.nn.top_k(
tf.cast(tf.reduce_any(selected_boxes > 0, [2]), tf.int32) *
tf.expand_dims(tf.range(num_boxes, 0, -1), 0), max_output_size)[0],
tf.int32)
idx = tf.minimum(idx, num_boxes - 1)
idx = tf.reshape(idx + tf.reshape(tf.range(batch_size) * num_boxes, [-1, 1]),
[-1])
boxes = tf.reshape(
tf.gather(tf.reshape(boxes, [-1, 4]), idx),
[batch_size, max_output_size, 4])
boxes = boxes * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1, 1]) < tf.reshape(
output_size, [-1, 1, 1]), boxes.dtype)
scores = tf.reshape(
tf.gather(tf.reshape(scores, [-1, 1]), idx),
[batch_size, max_output_size])
scores = scores * tf.cast(
tf.reshape(tf.range(max_output_size), [1, -1]) < tf.reshape(
output_size, [-1, 1]), scores.dtype)
return scores, boxes
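# --- Illustrative usage sketch (not part of the original file) ---
# A tiny sanity check of sorted_non_max_suppression_padded: three boxes sorted
# by score, where the second box heavily overlaps the first and should be
# suppressed at iou_threshold=0.5. Values are assumptions chosen for the sketch.
def _example_sorted_nms():
  scores = tf.constant([[0.9, 0.8, 0.7]])
  boxes = tf.constant([[[0.0, 0.0, 10.0, 10.0],
                        [0.0, 0.0, 9.0, 9.0],
                        [20.0, 20.0, 30.0, 30.0]]])
  # Returns padded, score-sorted outputs of shape [1, 2] and [1, 2, 4]; the
  # surviving boxes are the first and third input boxes.
  return sorted_non_max_suppression_padded(
      scores, boxes, max_output_size=2, iou_threshold=0.5)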
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Post-processing model outputs to generate detection."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from official.legacy.detection.ops import nms
from official.legacy.detection.utils import box_utils
def generate_detections_factory(params):
"""Factory to select function to generate detection."""
if params.use_batched_nms:
func = functools.partial(
_generate_detections_batched,
max_total_size=params.max_total_size,
nms_iou_threshold=params.nms_iou_threshold,
score_threshold=params.score_threshold)
else:
func = functools.partial(
_generate_detections,
max_total_size=params.max_total_size,
nms_iou_threshold=params.nms_iou_threshold,
score_threshold=params.score_threshold,
pre_nms_num_boxes=params.pre_nms_num_boxes)
return func
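# --- Illustrative usage sketch (not part of the original file) ---
# The factory only reads five attributes from `params`, so for experimentation
# a plain namespace can stand in for the `postprocess` config section. The
# values below are assumptions for the sketch, not defaults of this library.
def _example_build_detection_fn():
  import types
  postprocess_params = types.SimpleNamespace(
      use_batched_nms=False,
      max_total_size=100,
      nms_iou_threshold=0.5,
      score_threshold=0.05,
      pre_nms_num_boxes=5000)
  # Returns a functools.partial wrapping _generate_detections with these values
  # bound; calling it requires only (boxes, scores).
  return generate_detections_factory(postprocess_params)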
def _select_top_k_scores(scores_in, pre_nms_num_detections):
"""Select top_k scores and indices for each class.
Args:
scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
class logit outputs on all feature levels. The N is the number of total
anchors on all levels. The num_classes is the number of classes predicted
by the model.
pre_nms_num_detections: Number of candidates before NMS.
Returns:
scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
num_classes].
"""
batch_size, num_anchors, num_class = scores_in.get_shape().as_list()
scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
scores_trans = tf.reshape(scores_trans, [-1, num_anchors])
top_k_scores, top_k_indices = tf.nn.top_k(
scores_trans, k=pre_nms_num_detections, sorted=True)
top_k_scores = tf.reshape(top_k_scores,
[batch_size, num_class, pre_nms_num_detections])
top_k_indices = tf.reshape(top_k_indices,
[batch_size, num_class, pre_nms_num_detections])
return tf.transpose(top_k_scores,
[0, 2, 1]), tf.transpose(top_k_indices, [0, 2, 1])
def _generate_detections(boxes,
scores,
max_total_size=100,
nms_iou_threshold=0.3,
score_threshold=0.05,
pre_nms_num_boxes=5000):
"""Generate the final detections given the model outputs.
This uses class unrolling with while-loop-based NMS, which can be parallelized
across the batch dimension.
Args:
boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
N, 1, 4], which stacks box predictions from all feature levels. The N is the
number of total anchors on all levels.
scores: a tensor with shape [batch_size, N, num_classes], which stacks class
probability on all feature levels. The N is the number of total anchors on
all levels. The num_classes is the number of classes predicted by the
model. Note that the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
pre_nms_num_boxes: an int number of top candidate detections per class
before NMS.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size] representing
classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
with tf.name_scope('generate_detections'):
nmsed_boxes = []
nmsed_classes = []
nmsed_scores = []
valid_detections = []
batch_size, _, num_classes_for_box, _ = boxes.get_shape().as_list()
_, total_anchors, num_classes = scores.get_shape().as_list()
# Selects top pre_nms_num scores and indices before NMS.
scores, indices = _select_top_k_scores(
scores, min(total_anchors, pre_nms_num_boxes))
for i in range(num_classes):
boxes_i = boxes[:, :, min(num_classes_for_box - 1, i), :]
scores_i = scores[:, :, i]
# Obtains pre_nms_num_boxes before running NMS.
boxes_i = tf.gather(boxes_i, indices[:, :, i], batch_dims=1, axis=1)
# Filter out scores.
boxes_i, scores_i = box_utils.filter_boxes_by_scores(
boxes_i, scores_i, min_score_threshold=score_threshold)
(nmsed_scores_i, nmsed_boxes_i) = nms.sorted_non_max_suppression_padded(
tf.cast(scores_i, tf.float32),
tf.cast(boxes_i, tf.float32),
max_total_size,
iou_threshold=nms_iou_threshold)
nmsed_classes_i = tf.fill([batch_size, max_total_size], i)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
nmsed_boxes = tf.concat(nmsed_boxes, axis=1)
nmsed_scores = tf.concat(nmsed_scores, axis=1)
nmsed_classes = tf.concat(nmsed_classes, axis=1)
nmsed_scores, indices = tf.nn.top_k(
nmsed_scores, k=max_total_size, sorted=True)
nmsed_boxes = tf.gather(nmsed_boxes, indices, batch_dims=1, axis=1)
nmsed_classes = tf.gather(nmsed_classes, indices, batch_dims=1)
valid_detections = tf.reduce_sum(
input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32), axis=1)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
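# --- Illustrative usage sketch (not part of the original file) ---
# A small concrete call of _generate_detections with class-agnostic boxes
# ([batch, N, 1, 4]) and two foreground classes of already-activated scores.
# All tensor values are assumptions chosen for the sketch.
def _example_generate_detections():
  boxes = tf.constant([[[[0.0, 0.0, 10.0, 10.0]],
                        [[0.0, 0.0, 9.0, 9.0]],
                        [[20.0, 20.0, 30.0, 30.0]]]])  # [1, 3, 1, 4]
  scores = tf.constant([[[0.9, 0.1],
                         [0.8, 0.2],
                         [0.1, 0.7]]])  # [1, 3, 2]
  # Returns (boxes, scores, classes, valid_detections) padded to max_total_size.
  return _generate_detections(boxes, scores, max_total_size=5)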
def _generate_detections_per_image(boxes,
scores,
max_total_size=100,
nms_iou_threshold=0.3,
score_threshold=0.05,
pre_nms_num_boxes=5000):
"""Generate the final detections per image given the model outputs.
Args:
boxes: a tensor with shape [N, num_classes, 4] or [N, 1, 4], which stacks box
predictions from all feature levels. The N is the number of total anchors on
all levels.
scores: a tensor with shape [N, num_classes], which stacks class probability
on all feature levels. The N is the number of total anchors on all levels.
The num_classes is the number of classes predicted by the model. Note that
the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
pre_nms_num_boxes: an int number of top candidate detections per class
before NMS.
Returns:
nms_boxes: `float` Tensor of shape [max_total_size, 4] representing top
detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [max_total_size] representing sorted
confidence scores for detected boxes. The values are between [0, 1].
nms_classes: `int` Tensor of shape [max_total_size] representing classes for
detected boxes.
valid_detections: `int` Tensor of shape [1]. Only the top `valid_detections`
boxes are valid detections.
"""
nmsed_boxes = []
nmsed_scores = []
nmsed_classes = []
num_classes_for_box = boxes.get_shape().as_list()[1]
num_classes = scores.get_shape().as_list()[1]
for i in range(num_classes):
boxes_i = boxes[:, min(num_classes_for_box - 1, i)]
scores_i = scores[:, i]
# Obtains pre_nms_num_boxes before running NMS.
scores_i, indices = tf.nn.top_k(
scores_i, k=tf.minimum(tf.shape(input=scores_i)[-1], pre_nms_num_boxes))
boxes_i = tf.gather(boxes_i, indices)
(nmsed_indices_i, nmsed_num_valid_i) = tf.image.non_max_suppression_padded(
tf.cast(boxes_i, tf.float32),
tf.cast(scores_i, tf.float32),
max_total_size,
iou_threshold=nms_iou_threshold,
score_threshold=score_threshold,
pad_to_max_output_size=True,
name='nms_detections_' + str(i))
nmsed_boxes_i = tf.gather(boxes_i, nmsed_indices_i)
nmsed_scores_i = tf.gather(scores_i, nmsed_indices_i)
# Sets scores of invalid boxes to -1.
nmsed_scores_i = tf.where(
tf.less(tf.range(max_total_size), [nmsed_num_valid_i]), nmsed_scores_i,
-tf.ones_like(nmsed_scores_i))
nmsed_classes_i = tf.fill([max_total_size], i)
nmsed_boxes.append(nmsed_boxes_i)
nmsed_scores.append(nmsed_scores_i)
nmsed_classes.append(nmsed_classes_i)
# Concats results from all classes and sort them.
nmsed_boxes = tf.concat(nmsed_boxes, axis=0)
nmsed_scores = tf.concat(nmsed_scores, axis=0)
nmsed_classes = tf.concat(nmsed_classes, axis=0)
nmsed_scores, indices = tf.nn.top_k(
nmsed_scores, k=max_total_size, sorted=True)
nmsed_boxes = tf.gather(nmsed_boxes, indices)
nmsed_classes = tf.gather(nmsed_classes, indices)
valid_detections = tf.reduce_sum(
input_tensor=tf.cast(tf.greater(nmsed_scores, -1), tf.int32))
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
def _generate_detections_batched(boxes, scores, max_total_size,
nms_iou_threshold, score_threshold):
"""Generates detected boxes with scores and classes for one-stage detector.
The function takes output of multi-level ConvNets and anchor boxes and
generates detected boxes. Note that this uses batched NMS, which is not
currently supported on TPU.
Args:
boxes: a tensor with shape [batch_size, N, num_classes, 4] or [batch_size,
N, 1, 4], which stacks box predictions from all feature levels. The N is the
number of total anchors on all levels.
scores: a tensor with shape [batch_size, N, num_classes], which stacks class
probability on all feature levels. The N is the number of total anchors on
all levels. The num_classes is the number of classes predicted by the
model. Note that the class_outputs here is the raw score.
max_total_size: a scalar representing maximum number of boxes retained over
all classes.
nms_iou_threshold: a float representing the threshold for deciding whether
boxes overlap too much with respect to IOU.
score_threshold: a float representing the threshold for deciding when to
remove boxes based on score.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size] representing
classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
with tf.name_scope('generate_detections'):
# TODO(tsungyi): Remove normalization/denormalization once the
# tf.image.combined_non_max_suppression is coordinate system agnostic.
# Normalizes maximum box coordinates to 1.
normalizer = tf.reduce_max(boxes)
boxes /= normalizer
(nmsed_boxes, nmsed_scores, nmsed_classes,
valid_detections) = tf.image.combined_non_max_suppression(
boxes,
scores,
max_output_size_per_class=max_total_size,
max_total_size=max_total_size,
iou_threshold=nms_iou_threshold,
score_threshold=score_threshold,
pad_per_class=False,
)
# De-normalizes box coordinates.
nmsed_boxes *= normalizer
nmsed_classes = tf.cast(nmsed_classes, tf.int32)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
class MultilevelDetectionGenerator(tf.keras.layers.Layer):
"""Generates detected boxes with scores and classes for one-stage detector."""
def __init__(self, min_level, max_level, params):
self._min_level = min_level
self._max_level = max_level
self._generate_detections = generate_detections_factory(params)
super(MultilevelDetectionGenerator, self).__init__(autocast=False)
def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
# Collects outputs from all levels into a list.
boxes = []
scores = []
for i in range(self._min_level, self._max_level + 1):
box_outputs_i_shape = tf.shape(box_outputs[i])
batch_size = box_outputs_i_shape[0]
num_anchors_per_locations = box_outputs_i_shape[-1] // 4
num_classes = tf.shape(class_outputs[i])[-1] // num_anchors_per_locations
# Applies score transformation and removes the implicit background class.
scores_i = tf.sigmoid(
tf.reshape(class_outputs[i], [batch_size, -1, num_classes]))
scores_i = tf.slice(scores_i, [0, 0, 1], [-1, -1, -1])
# Box decoding.
# The anchor boxes are shared for all data in a batch.
# One stage detector only supports class agnostic box regression.
anchor_boxes_i = tf.reshape(anchor_boxes[i], [batch_size, -1, 4])
box_outputs_i = tf.reshape(box_outputs[i], [batch_size, -1, 4])
boxes_i = box_utils.decode_boxes(box_outputs_i, anchor_boxes_i)
# Box clipping.
boxes_i = box_utils.clip_boxes(boxes_i, image_shape)
boxes.append(boxes_i)
scores.append(scores_i)
boxes = tf.concat(boxes, axis=1)
scores = tf.concat(scores, axis=1)
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(tf.expand_dims(boxes, axis=2), scores))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
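# --- Illustrative usage sketch (not part of the original file) ---
# A minimal single-level call of MultilevelDetectionGenerator: one image, a 2x2
# feature map at level 3, one anchor per location and two classes (implicit
# background + one foreground). `postprocess_params` is assumed to be the same
# namespace-style object accepted by generate_detections_factory above.
def _example_multilevel_detection(postprocess_params):
  generator = MultilevelDetectionGenerator(
      min_level=3, max_level=3, params=postprocess_params)
  box_outputs = {3: tf.zeros([1, 2, 2, 4])}    # num_anchors * 4 = 4
  class_outputs = {3: tf.zeros([1, 2, 2, 2])}  # num_anchors * num_classes = 2
  anchor_boxes = {3: tf.reshape(
      tf.tile(tf.constant([[0.0, 0.0, 8.0, 8.0]]), [4, 1]), [1, 2, 2, 4])}
  image_shape = tf.constant([[[32.0, 32.0]]])  # [batch, 1, 2]
  return generator(box_outputs, class_outputs, anchor_boxes, image_shape)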
class GenericDetectionGenerator(tf.keras.layers.Layer):
"""Generates the final detected boxes with scores and classes."""
def __init__(self, params):
super(GenericDetectionGenerator, self).__init__(autocast=False)
self._generate_detections = generate_detections_factory(params)
def call(self, box_outputs, class_outputs, anchor_boxes, image_shape):
"""Generate final detections.
Args:
box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
representing the class-specific box coordinates relative to anchors.
class_outputs: a tensor of shape of [batch_size, K, num_classes]
representing the class logits before applying score activation.
anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
corresponding anchor boxes w.r.t `box_outputs`.
image_shape: a tensor of shape of [batch_size, 2] storing the image height
and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size]
representing classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
class_outputs = tf.nn.softmax(class_outputs, axis=-1)
# Removes the background class.
class_outputs_shape = tf.shape(class_outputs)
batch_size = class_outputs_shape[0]
num_locations = class_outputs_shape[1]
num_classes = class_outputs_shape[-1]
num_detections = num_locations * (num_classes - 1)
class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
box_outputs = tf.reshape(
box_outputs,
tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = tf.tile(
tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
box_outputs = tf.reshape(box_outputs,
tf.stack([batch_size, num_detections, 4], axis=-1))
anchor_boxes = tf.reshape(
anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))
# Box decoding.
decoded_boxes = box_utils.decode_boxes(
box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])
# Box clipping
decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
decoded_boxes = tf.reshape(
decoded_boxes,
tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(decoded_boxes, class_outputs))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
class OlnDetectionGenerator(GenericDetectionGenerator):
"""Generates the final detected boxes with scores and classes."""
def __call__(self, box_outputs, class_outputs, anchor_boxes, image_shape,
is_single_fg_score=False, keep_nms=True):
"""Generate final detections for Object Localization Network (OLN).
Args:
box_outputs: a tensor of shape of [batch_size, K, num_classes * 4]
representing the class-specific box coordinates relative to anchors.
class_outputs: a tensor of shape of [batch_size, K, num_classes]
representing the class logits before applying score activation.
anchor_boxes: a tensor of shape of [batch_size, K, 4] representing the
corresponding anchor boxes w.r.t `box_outputs`.
image_shape: a tensor of shape of [batch_size, 2] storing the image height
and width w.r.t. the scaled image, i.e. the same image space as
`box_outputs` and `anchor_boxes`.
is_single_fg_score: a Bool indicator of whether class_outputs includes the
background scores concatenated or not. By default, class_outputs is a
concatenation of both scores for the foreground and background. That is,
is_single_fg_score=False.
keep_nms: a Bool indicator of whether to perform NMS or not.
Returns:
nms_boxes: `float` Tensor of shape [batch_size, max_total_size, 4]
representing top detected boxes in [y1, x1, y2, x2].
nms_scores: `float` Tensor of shape [batch_size, max_total_size]
representing sorted confidence scores for detected boxes. The values are
between [0, 1].
nms_classes: `int` Tensor of shape [batch_size, max_total_size]
representing classes for detected boxes.
valid_detections: `int` Tensor of shape [batch_size]. Only the top
`valid_detections` boxes are valid detections.
"""
if is_single_fg_score:
# Concatenates dummy background scores.
dummy_bg_scores = tf.zeros_like(class_outputs)
class_outputs = tf.stack([dummy_bg_scores, class_outputs], -1)
else:
class_outputs = tf.nn.softmax(class_outputs, axis=-1)
# Removes the background class.
class_outputs_shape = tf.shape(class_outputs)
batch_size = class_outputs_shape[0]
num_locations = class_outputs_shape[1]
num_classes = class_outputs_shape[-1]
num_detections = num_locations * (num_classes - 1)
class_outputs = tf.slice(class_outputs, [0, 0, 1], [-1, -1, -1])
box_outputs = tf.reshape(
box_outputs,
tf.stack([batch_size, num_locations, num_classes, 4], axis=-1))
box_outputs = tf.slice(box_outputs, [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = tf.tile(
tf.expand_dims(anchor_boxes, axis=2), [1, 1, num_classes - 1, 1])
box_outputs = tf.reshape(box_outputs,
tf.stack([batch_size, num_detections, 4], axis=-1))
anchor_boxes = tf.reshape(
anchor_boxes, tf.stack([batch_size, num_detections, 4], axis=-1))
# Box decoding. For RPN outputs, box_outputs are all zeros.
decoded_boxes = box_utils.decode_boxes(
box_outputs, anchor_boxes, weights=[10.0, 10.0, 5.0, 5.0])
# Box clipping
decoded_boxes = box_utils.clip_boxes(decoded_boxes, image_shape)
decoded_boxes = tf.reshape(
decoded_boxes,
tf.stack([batch_size, num_locations, num_classes - 1, 4], axis=-1))
if keep_nms:
nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
self._generate_detections(decoded_boxes, class_outputs))
# Adds 1 to offset the background class which has index 0.
nmsed_classes += 1
else:
nmsed_boxes = decoded_boxes[:, :, 0, :]
nmsed_scores = class_outputs[:, :, 0]
nmsed_classes = tf.cast(tf.ones_like(nmsed_scores), tf.int32)
valid_detections = tf.cast(
tf.reduce_sum(tf.ones_like(nmsed_scores), axis=-1), tf.int32)
return nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ROI-related ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.ops import nms
from official.legacy.detection.utils import box_utils
def multilevel_propose_rois(rpn_boxes,
rpn_scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=2000,
rpn_post_nms_top_k=1000,
rpn_nms_threshold=0.7,
rpn_score_threshold=0.0,
rpn_min_size_threshold=0.0,
decode_boxes=True,
clip_boxes=True,
use_batched_nms=False,
apply_sigmoid_to_score=True):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Apply sigmoid transform if specified.
b. Decode boxes if specified.
c. Clip boxes if specified.
d. Filter small boxes and those that fall outside the image if specified.
e. Apply pre-NMS filtering including pre-NMS top k and score thresholding.
f. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
rpn_boxes: a dict with keys representing FPN levels and values representing
box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
rpn_scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension are
[height, width] of the scaled image.
rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
keep before applying NMS. Default: 2000.
rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
keep after applying NMS. Default: 1000.
rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
rpn_score_threshold: a float between 0 and 1 representing the minimal box
score to keep before applying NMS. This is often used as a pre-filtering
step for better performance. If 0, no filtering is applied. Default: 0.
rpn_min_size_threshold: a float representing the minimal box size in each
side (w.r.t. the scaled image) to keep before applying NMS. This is often
used as a pre-filtering step for better performance. If 0, no filtering is
applied. Default: 0.
decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
`anchor_boxes`. Default: True.
clip_boxes: a boolean indicating whether boxes are first clipped to the
scaled image size before applying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default: True.
use_batched_nms: a boolean indicating whether NMS is applied in batch using
`tf.image.combined_non_max_suppression`. Currently only available in
CPU/GPU. Default: False.
apply_sigmoid_to_score: a boolean indicating whether to apply sigmoid to
`rpn_scores` before applying NMS. Default: True.
Returns:
selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k, 1],
representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(rpn_scores.keys()):
with tf.name_scope('level_%d' % level):
_, feature_h, feature_w, num_anchors_per_location = (
rpn_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
this_level_boxes = box_utils.decode_boxes(this_level_boxes,
this_level_anchors)
if clip_boxes:
this_level_boxes = box_utils.clip_boxes(this_level_boxes, image_shape)
if rpn_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_utils.filter_boxes(
this_level_boxes, this_level_scores, image_shape,
rpn_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
if rpn_nms_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold,
score_threshold=rpn_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if rpn_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_utils.filter_boxes_by_scores(this_level_boxes,
this_level_scores,
rpn_score_threshold))
this_level_boxes, this_level_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold))
else:
this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
selected_rois, selected_roi_scores = box_utils.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
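# --- Illustrative usage sketch (not part of the original file) ---
# A minimal single-level call of multilevel_propose_rois: one image, a 2x2 RPN
# feature map at level 2 with one anchor per location. All tensor values are
# assumptions chosen for the sketch; thresholds are left at their defaults.
def _example_propose_rois():
  rpn_boxes = {2: tf.zeros([1, 2, 2, 4])}
  rpn_scores = {2: tf.zeros([1, 2, 2, 1])}
  anchor_boxes = {2: tf.reshape(
      tf.tile(tf.constant([[0.0, 0.0, 8.0, 8.0]]), [4, 1]), [1, 2, 2, 4])}
  image_shape = tf.constant([[16.0, 16.0]])  # [batch, 2] = [[height, width]]
  # Returns the selected RoIs and their scores after per-level NMS and an
  # overall top-k across levels.
  return multilevel_propose_rois(
      rpn_boxes, rpn_scores, anchor_boxes, image_shape,
      rpn_pre_nms_top_k=4, rpn_post_nms_top_k=4)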
class ROIGenerator(tf.keras.layers.Layer):
"""Proposes RoIs for the second stage processing."""
def __init__(self, params):
self._rpn_pre_nms_top_k = params.rpn_pre_nms_top_k
self._rpn_post_nms_top_k = params.rpn_post_nms_top_k
self._rpn_nms_threshold = params.rpn_nms_threshold
self._rpn_score_threshold = params.rpn_score_threshold
self._rpn_min_size_threshold = params.rpn_min_size_threshold
self._test_rpn_pre_nms_top_k = params.test_rpn_pre_nms_top_k
self._test_rpn_post_nms_top_k = params.test_rpn_post_nms_top_k
self._test_rpn_nms_threshold = params.test_rpn_nms_threshold
self._test_rpn_score_threshold = params.test_rpn_score_threshold
self._test_rpn_min_size_threshold = params.test_rpn_min_size_threshold
self._use_batched_nms = params.use_batched_nms
super(ROIGenerator, self).__init__(autocast=False)
def call(self, boxes, scores, anchor_boxes, image_shape, is_training):
"""Generates RoI proposals.
Args:
boxes: a dict with keys representing FPN levels and values representing
box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
is_training: a bool indicating whether it is in training or inference
mode.
Returns:
proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the proposed RoIs w.r.t. the
scaled image.
proposed_roi_scores: a tensor of shape
[batch_size, rpn_post_nms_top_k, 1], representing the scores of the
proposed RoIs.
"""
proposed_rois, proposed_roi_scores = multilevel_propose_rois(
boxes,
scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
if is_training else self._test_rpn_pre_nms_top_k),
rpn_post_nms_top_k=(self._rpn_post_nms_top_k
if is_training else self._test_rpn_post_nms_top_k),
rpn_nms_threshold=(self._rpn_nms_threshold
if is_training else self._test_rpn_nms_threshold),
rpn_score_threshold=(self._rpn_score_threshold if is_training else
self._test_rpn_score_threshold),
rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
self._test_rpn_min_size_threshold),
decode_boxes=True,
clip_boxes=True,
use_batched_nms=self._use_batched_nms,
apply_sigmoid_to_score=True)
return proposed_rois, proposed_roi_scores
class OlnROIGenerator(ROIGenerator):
"""Proposes RoIs for the second stage processing."""
def __call__(self, boxes, scores, anchor_boxes, image_shape, is_training,
is_box_lrtb=False, object_scores=None):
"""Generates RoI proposals.
Args:
boxes: a dict with keys representing FPN levels and values representing
box tensors of shape [batch_size, feature_h, feature_w, num_anchors * 4].
scores: a dict with keys representing FPN levels and values representing
logit tensors of shape [batch_size, feature_h, feature_w, num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
is_training: a bool indicating whether it is in training or inference
mode.
is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
bottom) format.
object_scores: another objectness score (e.g., centerness). In OLN, we use
object_scores=centerness as a replacement of the scores at each level.
A dict with keys representing FPN levels and values representing logit
tensors of shape [batch_size, feature_h, feature_w, num_anchors].
Returns:
proposed_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the proposed RoIs w.r.t. the
scaled image.
proposed_roi_scores: a tensor of shape
[batch_size, rpn_post_nms_top_k, 1], representing the scores of the
proposed RoIs.
"""
proposed_rois, proposed_roi_scores = self.oln_multilevel_propose_rois(
boxes,
scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=(self._rpn_pre_nms_top_k
if is_training else self._test_rpn_pre_nms_top_k),
rpn_post_nms_top_k=(self._rpn_post_nms_top_k
if is_training else self._test_rpn_post_nms_top_k),
rpn_nms_threshold=(self._rpn_nms_threshold
if is_training else self._test_rpn_nms_threshold),
rpn_score_threshold=(self._rpn_score_threshold if is_training else
self._test_rpn_score_threshold),
rpn_min_size_threshold=(self._rpn_min_size_threshold if is_training else
self._test_rpn_min_size_threshold),
decode_boxes=True,
clip_boxes=True,
use_batched_nms=self._use_batched_nms,
apply_sigmoid_to_score=True,
is_box_lrtb=is_box_lrtb,
rpn_object_scores=object_scores,)
return proposed_rois, proposed_roi_scores
def oln_multilevel_propose_rois(self,
rpn_boxes,
rpn_scores,
anchor_boxes,
image_shape,
rpn_pre_nms_top_k=2000,
rpn_post_nms_top_k=1000,
rpn_nms_threshold=0.7,
rpn_score_threshold=0.0,
rpn_min_size_threshold=0.0,
decode_boxes=True,
clip_boxes=True,
use_batched_nms=False,
apply_sigmoid_to_score=True,
is_box_lrtb=False,
rpn_object_scores=None,):
"""Proposes RoIs given a group of candidates from different FPN levels.
The following describes the steps:
1. For each individual level:
a. Adjust scores for each level if specified by rpn_object_scores.
b. Apply sigmoid transform if specified.
c. Decode boxes (either of xyhw or left-right-top-bottom format) if
specified.
d. Clip boxes if specified.
e. Filter small boxes and those that fall outside the image if specified.
f. Apply pre-NMS filtering including pre-NMS top k and score
thresholding.
g. Apply NMS.
2. Aggregate post-NMS boxes from each level.
3. Apply an overall top k to generate the final selected RoIs.
Args:
rpn_boxes: a dict with keys representing FPN levels and values
representing box tensors of shape [batch_size, feature_h, feature_w,
num_anchors * 4].
rpn_scores: a dict with keys representing FPN levels and values
representing logit tensors of shape [batch_size, feature_h, feature_w,
num_anchors].
anchor_boxes: a dict with keys representing FPN levels and values
representing anchor box tensors of shape [batch_size, feature_h,
feature_w, num_anchors * 4].
image_shape: a tensor of shape [batch_size, 2] where the last dimension
are [height, width] of the scaled image.
rpn_pre_nms_top_k: an integer of top scoring RPN proposals *per level* to
keep before applying NMS. Default: 2000.
rpn_post_nms_top_k: an integer of top scoring RPN proposals *in total* to
keep after applying NMS. Default: 1000.
rpn_nms_threshold: a float between 0 and 1 representing the IoU threshold
used for NMS. If 0.0, no NMS is applied. Default: 0.7.
rpn_score_threshold: a float between 0 and 1 representing the minimal box
score to keep before applying NMS. This is often used as a pre-filtering
step for better performance. If 0, no filtering is applied. Default: 0.
rpn_min_size_threshold: a float representing the minimum box size on each
side (w.r.t. the scaled image) to keep before applying NMS. This is
often used as a pre-filtering step for better performance. If 0, no
filtering is applied. Default: 0.
decode_boxes: a boolean indicating whether `rpn_boxes` needs to be decoded
using `anchor_boxes`. If False, use `rpn_boxes` directly and ignore
`anchor_boxes`. Default: True.
clip_boxes: a boolean indicating whether boxes are first clipped to the
scaled image size before applying NMS. If False, no clipping is applied
and `image_shape` is ignored. Default: True.
use_batched_nms: a boolean indicating whether NMS is applied in batch
using `tf.image.combined_non_max_suppression`. Currently only available
in CPU/GPU. Default: False.
apply_sigmoid_to_score: a boolean indicating whether to apply sigmoid to
`rpn_scores` before applying NMS. Default: True.
is_box_lrtb: a bool indicating whether boxes are in lrtb (=left,right,top,
bottom) format.
rpn_object_scores: a predicted objectness score (e.g., centerness). In
OLN, we use object_scores=centerness as a replacement for the scores at
each level. A dict with keys representing FPN levels and values
representing logit tensors of shape [batch_size, feature_h, feature_w,
num_anchors].
Returns:
selected_rois: a tensor of shape [batch_size, rpn_post_nms_top_k, 4],
representing the box coordinates of the selected proposals w.r.t. the
scaled image.
selected_roi_scores: a tensor of shape [batch_size, rpn_post_nms_top_k,
1], representing the scores of the selected proposals.
"""
with tf.name_scope('multilevel_propose_rois'):
rois = []
roi_scores = []
image_shape = tf.expand_dims(image_shape, axis=1)
for level in sorted(rpn_scores.keys()):
with tf.name_scope('level_%d' % level):
_, feature_h, feature_w, num_anchors_per_location = (
rpn_scores[level].get_shape().as_list())
num_boxes = feature_h * feature_w * num_anchors_per_location
this_level_scores = tf.reshape(rpn_scores[level], [-1, num_boxes])
this_level_boxes = tf.reshape(rpn_boxes[level], [-1, num_boxes, 4])
this_level_anchors = tf.cast(
tf.reshape(anchor_boxes[level], [-1, num_boxes, 4]),
dtype=this_level_scores.dtype)
if rpn_object_scores:
this_level_object_scores = rpn_object_scores[level]
this_level_object_scores = tf.reshape(this_level_object_scores,
[-1, num_boxes])
this_level_object_scores = tf.cast(this_level_object_scores,
this_level_scores.dtype)
this_level_scores = this_level_object_scores
if apply_sigmoid_to_score:
this_level_scores = tf.sigmoid(this_level_scores)
if decode_boxes:
if is_box_lrtb: # Box in left-right-top-bottom format.
this_level_boxes = box_utils.decode_boxes_lrtb(
this_level_boxes, this_level_anchors)
else: # Box in standard x-y-h-w format.
this_level_boxes = box_utils.decode_boxes(
this_level_boxes, this_level_anchors)
if clip_boxes:
this_level_boxes = box_utils.clip_boxes(
this_level_boxes, image_shape)
if rpn_min_size_threshold > 0.0:
this_level_boxes, this_level_scores = box_utils.filter_boxes(
this_level_boxes, this_level_scores, image_shape,
rpn_min_size_threshold)
this_level_pre_nms_top_k = min(num_boxes, rpn_pre_nms_top_k)
this_level_post_nms_top_k = min(num_boxes, rpn_post_nms_top_k)
if rpn_nms_threshold > 0.0:
if use_batched_nms:
this_level_rois, this_level_roi_scores, _, _ = (
tf.image.combined_non_max_suppression(
tf.expand_dims(this_level_boxes, axis=2),
tf.expand_dims(this_level_scores, axis=-1),
max_output_size_per_class=this_level_pre_nms_top_k,
max_total_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold,
score_threshold=rpn_score_threshold,
pad_per_class=False,
clip_boxes=False))
else:
if rpn_score_threshold > 0.0:
this_level_boxes, this_level_scores = (
box_utils.filter_boxes_by_scores(this_level_boxes,
this_level_scores,
rpn_score_threshold))
this_level_boxes, this_level_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores,
k=this_level_pre_nms_top_k)
this_level_roi_scores, this_level_rois = (
nms.sorted_non_max_suppression_padded(
this_level_scores,
this_level_boxes,
max_output_size=this_level_post_nms_top_k,
iou_threshold=rpn_nms_threshold))
else:
this_level_rois, this_level_roi_scores = box_utils.top_k_boxes(
this_level_boxes, this_level_scores, k=this_level_post_nms_top_k)
rois.append(this_level_rois)
roi_scores.append(this_level_roi_scores)
all_rois = tf.concat(rois, axis=1)
all_roi_scores = tf.concat(roi_scores, axis=1)
with tf.name_scope('top_k_rois'):
_, num_valid_rois = all_roi_scores.get_shape().as_list()
overall_top_k = min(num_valid_rois, rpn_post_nms_top_k)
selected_rois, selected_roi_scores = box_utils.top_k_boxes(
all_rois, all_roi_scores, k=overall_top_k)
return selected_rois, selected_roi_scores
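# Illustrative usage sketch (not part of the original file; names and shapes
# are assumptions): driving the OLN proposal routine above with per-level RPN
# outputs, where `proposal_layer` is an instance of the class defined above
# and `centerness` holds the per-level objectness replacement scores.
#
#   rpn_boxes = {2: ..., 3: ..., 4: ...}   # each [B, H_l, W_l, A * 4]
#   rpn_scores = {2: ..., 3: ..., 4: ...}  # each [B, H_l, W_l, A]
#   centerness = {2: ..., 3: ..., 4: ...}  # each [B, H_l, W_l, A]
#   rois, roi_scores = proposal_layer.oln_multilevel_propose_rois(
#       rpn_boxes, rpn_scores, anchor_boxes, image_shape,
#       rpn_pre_nms_top_k=2000, rpn_post_nms_top_k=1000,
#       rpn_nms_threshold=0.7, is_box_lrtb=True,
#       rpn_object_scores=centerness)
#   # rois: [B, rpn_post_nms_top_k, 4] w.r.t. the scaled image.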
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to performa spatial transformation for Tensor."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
_EPSILON = 1e-8
def nearest_upsampling(data, scale):
"""Nearest neighbor upsampling implementation.
Args:
data: A tensor with a shape of [batch, height_in, width_in, channels].
scale: An integer multiple to scale resolution of input data.
Returns:
data_up: A tensor with a shape of
[batch, height_in*scale, width_in*scale, channels]. Same dtype as input
data.
"""
with tf.name_scope('nearest_upsampling'):
bs, _, _, c = data.get_shape().as_list()
shape = tf.shape(input=data)
h = shape[1]
w = shape[2]
bs = -1 if bs is None else bs
# Uses reshape to quickly upsample the input. The nearest pixel is selected
# implicitly via broadcasting.
data = tf.reshape(data, [bs, h, 1, w, 1, c]) * tf.ones(
[1, 1, scale, 1, scale, 1], dtype=data.dtype)
return tf.reshape(data, [bs, h * scale, w * scale, c])
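# Minimal worked example of the reshape-and-broadcast trick above
# (illustrative; assumes TF2 eager execution):
#
#   data = tf.constant([[[[1.], [2.]],
#                        [[3.], [4.]]]])      # shape [1, 2, 2, 1]
#   up = nearest_upsampling(data, scale=2)    # shape [1, 4, 4, 1]
#   # up[0, :, :, 0] ==
#   #   [[1., 1., 2., 2.],
#   #    [1., 1., 2., 2.],
#   #    [3., 3., 4., 4.],
#   #    [3., 3., 4., 4.]]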
def feature_bilinear_interpolation(features, kernel_y, kernel_x):
"""Feature bilinear interpolation.
The RoIAlign feature f can be computed by bilinear interpolation
of four neighboring feature points f0, f1, f2, and f3.
f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
[f10, f11]]
f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
kernel_y = [hy, ly]
kernel_x = [hx, lx]
Args:
features: The features are in shape of [batch_size, num_boxes, output_size *
2, output_size * 2, num_filters].
kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
(batch_size, num_boxes, output_size, _,
num_filters) = features.get_shape().as_list()
output_size = output_size // 2
kernel_y = tf.reshape(kernel_y, [batch_size, num_boxes, output_size * 2, 1])
kernel_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, output_size * 2])
# Use implicit broadcast to generate the interpolation kernel. The
# multiplier `4` is for avg pooling.
interpolation_kernel = kernel_y * kernel_x * 4
# Interpolate the gathered features with computed interpolation kernels.
features *= tf.cast(
tf.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype)
features = tf.reshape(
features,
[batch_size * num_boxes, output_size * 2, output_size * 2, num_filters])
features = tf.nn.avg_pool(features, [1, 2, 2, 1], [1, 2, 2, 1], 'VALID')
features = tf.reshape(
features, [batch_size, num_boxes, output_size, output_size, num_filters])
return features
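# Worked example of the interpolation kernel above (illustrative): for a
# single sample point with ly = 0.25 and lx = 0.75 (so hy = 0.75, hx = 0.25),
# the four weights are
#   w00 = hy*hx = 0.1875, w01 = hy*lx = 0.5625,
#   w10 = ly*hx = 0.0625, w11 = ly*lx = 0.1875,
# which sum to 1. The factor of 4 applied to the kernel exactly cancels the
# 1/4 averaging of the subsequent 2x2 avg pool.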
def compute_grid_positions(boxes, boundaries, output_size, sample_offset):
"""Compute the grid position w.r.t.
the corresponding feature map.
Args:
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
the boundary (in (y, x)) of the corresponding feature map for each box.
Any resampled grid points that go beyond the boundary will be clipped.
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicating the subpixel sample offset
from the grid point.
Returns:
kernel_y: Tensor of size [batch_size, boxes, output_size, 2, 1].
kernel_x: Tensor of size [batch_size, boxes, output_size, 2, 1].
box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2]
box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2]
"""
batch_size, num_boxes, _ = boxes.get_shape().as_list()
box_grid_x = []
box_grid_y = []
for i in range(output_size):
box_grid_x.append(boxes[:, :, 1] +
(i + sample_offset) * boxes[:, :, 3] / output_size)
box_grid_y.append(boxes[:, :, 0] +
(i + sample_offset) * boxes[:, :, 2] / output_size)
box_grid_x = tf.stack(box_grid_x, axis=2)
box_grid_y = tf.stack(box_grid_y, axis=2)
box_grid_y0 = tf.floor(box_grid_y)
box_grid_x0 = tf.floor(box_grid_x)
box_grid_x0 = tf.maximum(0., box_grid_x0)
box_grid_y0 = tf.maximum(0., box_grid_y0)
box_grid_x0 = tf.minimum(box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1))
box_grid_x1 = tf.minimum(box_grid_x0 + 1,
tf.expand_dims(boundaries[:, :, 1], -1))
box_grid_y0 = tf.minimum(box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1))
box_grid_y1 = tf.minimum(box_grid_y0 + 1,
tf.expand_dims(boundaries[:, :, 0], -1))
box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1)
box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1)
# The RoIAlign feature f can be computed by bilinear interpolation of four
# neighboring feature points f0, f1, f2, and f3.
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
# [f10, f11]]
# f(y, x) = (hy*hx)f00 + (hy*lx)f01 + (ly*hx)f10 + (lx*ly)f11
# f(y, x) = w00*f00 + w01*f01 + w10*f10 + w11*f11
ly = box_grid_y - box_grid_y0
lx = box_grid_x - box_grid_x0
hy = 1.0 - ly
hx = 1.0 - lx
kernel_y = tf.reshape(
tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1])
kernel_x = tf.reshape(
tf.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1])
return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1
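# Worked example (illustrative): for a box with top-left (y, x) = (10, 20),
# size (h, w) = (4, 8), output_size = 2 and sample_offset = 0.5, the sample
# centers are
#   box_grid_y = [10 + 0.5 * 4 / 2, 10 + 1.5 * 4 / 2] = [11, 13]
#   box_grid_x = [20 + 0.5 * 8 / 2, 20 + 1.5 * 8 / 2] = [22, 26]
# and each center contributes its floor / floor+1 neighbors (clipped to the
# feature boundary) together with the [hy, ly] and [hx, lx] kernels above.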
def get_grid_one_hot(box_gridy0y1, box_gridx0x1, feature_height, feature_width):
"""Get grid_one_hot from indices and feature_size."""
(batch_size, num_boxes, output_size, _) = box_gridx0x1.get_shape().as_list()
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size, 2]),
dtype=tf.int32)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size, 2]),
dtype=tf.int32)
# shape is [batch_size, num_boxes, output_size, 2, height]
grid_y_one_hot = tf.one_hot(tf.cast(y_indices, tf.int32), feature_height)
# shape is [batch_size, num_boxes, output_size, 2, width]
grid_x_one_hot = tf.one_hot(tf.cast(x_indices, tf.int32), feature_width)
return grid_y_one_hot, grid_x_one_hot
def selective_crop_and_resize(features,
boxes,
box_levels,
boundaries,
output_size=7,
sample_offset=0.5,
use_einsum_gather=False):
"""Crop and resize boxes on a set of feature maps.
Given multiple features maps indexed by different levels, and a set of boxes
where each box is mapped to a certain level, it selectively crops and resizes
boxes from the corresponding feature maps to generate the box features.
We follow the ROIAlign technique (see https://arxiv.org/pdf/1703.06870.pdf,
figure 3 for reference). Specifically, for each feature map, we select an
(output_size, output_size) set of pixels corresponding to the box location,
and then use bilinear interpolation to select the feature value for each
pixel.
For performance, we perform the gather and interpolation on all layers as a
single operation. In this op the multi-level features are first stacked and
gathered into [2*output_size, 2*output_size] feature points. Then bilinear
interpolation is performed on the gathered feature points to generate
[output_size, output_size] RoIAlign feature map.
Here is the step-by-step algorithm:
1. The multi-level features are gathered into a
[batch_size, num_boxes, output_size*2, output_size*2, num_filters]
Tensor. The Tensor contains four neighboring feature points for each
vertex in the output grid.
2. Compute the interpolation kernel of shape
[batch_size, num_boxes, output_size*2, output_size*2]. The last 2 axis
can be seen as stacking 2x2 interpolation kernels for all vertices in the
output grid.
3. Element-wise multiply the gathered features and interpolation kernel.
Then apply 2x2 average pooling to reduce spatial dimension to
output_size.
Args:
features: a 5-D tensor of shape [batch_size, num_levels, max_height,
max_width, num_filters] where cropping and resizing are based.
boxes: a 3-D tensor of shape [batch_size, num_boxes, 4] encoding the
information of each box w.r.t. the corresponding feature map.
boxes[:, :, 0:2] are the grid position in (y, x) (float) of the top-left
corner of each box. boxes[:, :, 2:4] are the box sizes in (h, w) (float)
in terms of the number of pixels of the corresponding feature map size.
box_levels: a 3-D tensor of shape [batch_size, num_boxes, 1] representing
the 0-based corresponding feature level index of each box.
boundaries: a 3-D tensor of shape [batch_size, num_boxes, 2] representing
the boundary (in (y, x)) of the corresponding feature map for each box.
Any resampled grid points that go beyond the boundary will be clipped.
output_size: a scalar indicating the output crop size.
sample_offset: a float number in [0, 1] indicating the subpixel sample offset
from the grid point.
use_einsum_gather: whether to use einsum to replace gather. Using einsum
can improve performance when the feature size is not large, and einsum is
also friendlier to model partitioning. Gather's performance is better when
the feature size is very large and there are multiple box levels.
Returns:
features_per_box: a 5-D tensor of shape
[batch_size, num_boxes, output_size, output_size, num_filters]
representing the cropped features.
"""
(batch_size, num_levels, max_feature_height, max_feature_width,
num_filters) = features.get_shape().as_list()
_, num_boxes, _ = boxes.get_shape().as_list()
kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
boxes, boundaries, output_size, sample_offset)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
if use_einsum_gather:
# Bilinear interpolation is done during the last two gathers:
# f(y, x) = [hy, ly] * [[f00, f01], * [hx, lx]^T
# [f10, f11]]
# [[f00, f01],
# [f10, f11]] = tf.einsum(tf.einsum(features, y_one_hot), x_one_hot)
# where [hy, ly] and [hx, lx] are the bilinear interpolation kernel.
# shape is [batch_size, boxes, output_size, 2, 1]
grid_y_one_hot, grid_x_one_hot = get_grid_one_hot(box_gridy0y1,
box_gridx0x1,
max_feature_height,
max_feature_width)
# shape is [batch_size, num_boxes, output_size, height]
grid_y_weight = tf.reduce_sum(
tf.multiply(grid_y_one_hot, kernel_y), axis=-2)
# shape is [batch_size, num_boxes, output_size, width]
grid_x_weight = tf.reduce_sum(
tf.multiply(grid_x_one_hot, kernel_x), axis=-2)
# Gather for y_axis.
# shape is [batch_size, num_boxes, output_size, width, features]
features_per_box = tf.einsum('bmhwf,bmoh->bmowf', features,
tf.cast(grid_y_weight, features.dtype))
# Gather for x_axis.
# shape is [batch_size, num_boxes, output_size, output_size, features]
features_per_box = tf.einsum('bmhwf,bmow->bmhof', features_per_box,
tf.cast(grid_x_weight, features.dtype))
else:
height_dim_offset = max_feature_width
level_dim_offset = max_feature_height * height_dim_offset
batch_dim_offset = num_levels * level_dim_offset
batch_size_offset = tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_offset, [batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2])
box_levels_offset = tf.tile(
tf.reshape(box_levels * level_dim_offset,
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2])
y_indices_offset = tf.tile(
tf.reshape(y_indices * height_dim_offset,
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2])
x_indices_offset = tf.tile(
tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1])
indices = tf.reshape(
batch_size_offset + box_levels_offset + y_indices_offset +
x_indices_offset, [-1])
features = tf.reshape(features, [-1, num_filters])
# TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
# performance.
features_per_box = tf.reshape(
tf.gather(features, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
features_per_box = feature_bilinear_interpolation(features_per_box,
kernel_y, kernel_x)
return features_per_box
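# Illustrative usage sketch (shapes are assumptions): cropping 7x7 RoIAlign
# features from a pyramid that has already been stacked along a level axis.
#
#   features = tf.random.normal([2, 4, 160, 160, 256])  # [B, levels, H, W, C]
#   boxes = ...       # [2, 100, 4] in (y, x, h, w) feature-map pixels
#   box_levels = ...  # [2, 100, 1], 0-based level index of each box
#   boundaries = ...  # [2, 100, 2], (height - 1, width - 1) of each box's level
#   crops = selective_crop_and_resize(
#       features, boxes, box_levels, boundaries, output_size=7)
#   # crops: [2, 100, 7, 7, 256]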
def multilevel_crop_and_resize(features, boxes, output_size=7):
"""Crop and resize on multilevel feature pyramid.
Generate the (output_size, output_size) set of pixels for each input box
by first locating the box into the correct feature level, and then cropping
and resizing it using the corresponding feature map of that level.
Args:
features: A dictionary with key as pyramid level and value as features. The
features are in shape of [batch_size, height_l, width_l, num_filters].
boxes: A 3-D Tensor of shape [batch_size, num_boxes, 4]. Each row represents
a box with [y1, x1, y2, x2] in un-normalized coordinates.
output_size: A scalar to indicate the output crop size.
Returns:
A 5-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size, num_filters].
"""
with tf.name_scope('multilevel_crop_and_resize'):
levels = list(features.keys())
min_level = min(levels)
max_level = max(levels)
batch_size, max_feature_height, max_feature_width, num_filters = (
features[min_level].get_shape().as_list())
_, num_boxes, _ = boxes.get_shape().as_list()
# Stack feature pyramid into a features_all of shape
# [batch_size, levels, height, width, num_filters].
features_all = []
feature_heights = []
feature_widths = []
for level in range(min_level, max_level + 1):
shape = features[level].get_shape().as_list()
feature_heights.append(shape[1])
feature_widths.append(shape[2])
# Concat tensor of [batch_size, height_l * width_l, num_filters] for each
# levels.
features_all.append(
tf.reshape(features[level], [batch_size, -1, num_filters]))
features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters])
# Calculate height_l * width_l for each level.
level_dim_sizes = [
feature_widths[i] * feature_heights[i]
for i in range(len(feature_widths))
]
# level_dim_offsets is accumulated sum of level_dim_size.
level_dim_offsets = [0]
for i in range(len(feature_widths) - 1):
level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i])
batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1]
level_dim_offsets = tf.constant(level_dim_offsets, tf.int32)
height_dim_sizes = tf.constant(feature_widths, tf.int32)
# Assigns boxes to the right level.
box_width = boxes[:, :, 3] - boxes[:, :, 1]
box_height = boxes[:, :, 2] - boxes[:, :, 0]
areas_sqrt = tf.sqrt(box_height * box_width)
levels = tf.cast(
tf.math.floordiv(
tf.math.log(tf.divide(areas_sqrt, 224.0)), tf.math.log(2.0)) + 4.0,
dtype=tf.int32)
# Maps levels between [min_level, max_level].
levels = tf.minimum(max_level, tf.maximum(levels, min_level))
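# Worked example of the level assignment above (illustrative): a box whose
# sqrt(area) is 224 maps to level 4, a 112x112 box maps to level 3 and a
# 448x448 box maps to level 5, with the clamp above keeping the result
# within [min_level, max_level].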
# Projects box location and sizes to corresponding feature levels.
scale_to_level = tf.cast(
tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)),
dtype=boxes.dtype)
boxes /= tf.expand_dims(scale_to_level, axis=2)
box_width /= scale_to_level
box_height /= scale_to_level
boxes = tf.concat([
boxes[:, :, 0:2],
tf.expand_dims(box_height, -1),
tf.expand_dims(box_width, -1)
],
axis=-1)
# Maps levels to [0, max_level-min_level].
levels -= min_level
level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32))
boundary = tf.cast(
tf.concat([
tf.expand_dims(
[[tf.cast(max_feature_height, tf.float32)]] / level_strides - 1,
axis=-1),
tf.expand_dims(
[[tf.cast(max_feature_width, tf.float32)]] / level_strides - 1,
axis=-1),
],
axis=-1), boxes.dtype)
# Compute grid positions.
kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 = compute_grid_positions(
boxes, boundary, output_size, sample_offset=0.5)
x_indices = tf.cast(
tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
y_indices = tf.cast(
tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]),
dtype=tf.int32)
batch_size_offset = tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
[1, num_boxes, output_size * 2, output_size * 2])
# Get level offset for each box. Each box belongs to one level.
levels_offset = tf.tile(
tf.reshape(
tf.gather(level_dim_offsets, levels),
[batch_size, num_boxes, 1, 1]),
[1, 1, output_size * 2, output_size * 2])
y_indices_offset = tf.tile(
tf.reshape(
y_indices * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1),
[batch_size, num_boxes, output_size * 2, 1]),
[1, 1, 1, output_size * 2])
x_indices_offset = tf.tile(
tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]),
[1, 1, output_size * 2, 1])
indices = tf.reshape(
batch_size_offset + levels_offset + y_indices_offset + x_indices_offset,
[-1])
# TODO(wangtao): replace tf.gather with tf.gather_nd and try to get similar
# performance.
features_per_box = tf.reshape(
tf.gather(features_r2, indices),
[batch_size, num_boxes, output_size * 2, output_size * 2, num_filters])
# Bilinear interpolation.
features_per_box = feature_bilinear_interpolation(features_per_box,
kernel_y, kernel_x)
return features_per_box
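# Illustrative usage sketch (shapes are assumptions): cropping RoI features
# from an FPN dict keyed by level.
#
#   fpn_features = {
#       level: tf.random.normal(
#           [2, 256 // 2**(level - 2), 256 // 2**(level - 2), 256])
#       for level in range(2, 6)
#   }
#   boxes = ...  # [2, 100, 4], un-normalized [y1, x1, y2, x2] image pixels
#   roi_features = multilevel_crop_and_resize(fpn_features, boxes,
#                                             output_size=7)
#   # roi_features: [2, 100, 7, 7, 256]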
def single_level_feature_crop(features, level_boxes, detection_prior_levels,
min_mask_level, mask_crop_size):
"""Crop the FPN features at the appropriate levels for each detection.
Args:
features: a float tensor of shape [batch_size, num_levels, max_feature_size,
max_feature_size, num_downsample_channels].
level_boxes: a float Tensor of the level boxes to crop from. [batch_size,
num_instances, 4].
detection_prior_levels: an int Tensor of instance assigned level of shape
[batch_size, num_instances].
min_mask_level: minimum FPN level to crop mask feature from.
mask_crop_size: an int of mask crop size.
Returns:
crop_features: a float Tensor of shape [batch_size * num_instances,
mask_crop_size, mask_crop_size, num_downsample_channels]. This is the
instance feature crop.
"""
(batch_size, num_levels, max_feature_size, _,
num_downsample_channels) = features.get_shape().as_list()
_, num_of_instances, _ = level_boxes.get_shape().as_list()
level_boxes = tf.cast(level_boxes, tf.int32)
assert num_of_instances == detection_prior_levels.get_shape().as_list()[1]
x_start_indices = level_boxes[:, :, 1]
y_start_indices = level_boxes[:, :, 0]
# generate the full indices (not just the starting index)
x_idx_list = []
y_idx_list = []
for i in range(mask_crop_size):
x_idx_list.append(x_start_indices + i)
y_idx_list.append(y_start_indices + i)
x_indices = tf.stack(x_idx_list, axis=2)
y_indices = tf.stack(y_idx_list, axis=2)
levels = detection_prior_levels - min_mask_level
height_dim_size = max_feature_size
level_dim_size = max_feature_size * height_dim_size
batch_dim_size = num_levels * level_dim_size
# TODO(weicheng) change this to gather_nd for better readability.
indices = tf.reshape(
tf.tile(
tf.reshape(
tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1]),
[1, num_of_instances, mask_crop_size, mask_crop_size]) + tf.tile(
tf.reshape(levels * level_dim_size,
[batch_size, num_of_instances, 1, 1]),
[1, 1, mask_crop_size, mask_crop_size]) + tf.tile(
tf.reshape(y_indices * height_dim_size,
[batch_size, num_of_instances, mask_crop_size, 1]),
[1, 1, 1, mask_crop_size]) +
tf.tile(
tf.reshape(x_indices,
[batch_size, num_of_instances, 1, mask_crop_size]),
[1, 1, mask_crop_size, 1]), [-1])
features_r2 = tf.reshape(features, [-1, num_downsample_channels])
crop_features = tf.reshape(
tf.gather(features_r2, indices), [
batch_size * num_of_instances, mask_crop_size, mask_crop_size,
num_downsample_channels
])
return crop_features
def crop_mask_in_target_box(masks,
boxes,
target_boxes,
output_size,
sample_offset=0,
use_einsum=True):
"""Crop masks in target boxes.
Args:
masks: A tensor with a shape of [batch_size, num_masks, height, width].
boxes: a float tensor representing box coordinates that tightly enclose
masks with a shape of [batch_size, num_masks, 4] in un-normalized
coordinates. A box is represented by [ymin, xmin, ymax, xmax].
target_boxes: a float tensor representing target box coordinates for masks
with a shape of [batch_size, num_masks, 4] in un-normalized coordinates. A
box is represented by [ymin, xmin, ymax, xmax].
output_size: A scalar to indicate the output crop size. It currently only
supports square outputs.
sample_offset: a float number in [0, 1] indicating the subpixel sample offset
from the grid point.
use_einsum: Use einsum to replace gather in selective_crop_and_resize.
Returns:
A 4-D tensor representing feature crop of shape
[batch_size, num_boxes, output_size, output_size].
"""
with tf.name_scope('crop_mask_in_target_box'):
batch_size, num_masks, height, width = masks.get_shape().as_list()
masks = tf.reshape(masks, [batch_size * num_masks, height, width, 1])
# Pad zeros on the boundary of masks.
masks = tf.image.pad_to_bounding_box(masks, 2, 2, height + 4, width + 4)
masks = tf.reshape(masks, [batch_size, num_masks, height + 4, width + 4, 1])
# Projects target box locations and sizes to corresponding cropped
# mask coordinates.
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=target_boxes, num_or_size_splits=4, axis=2)
# x and w scale by the mask width; y and h scale by the mask height.
y_transform = (bb_y_min - gt_y_min) * height / (gt_y_max - gt_y_min +
_EPSILON) + 2
x_transform = (bb_x_min - gt_x_min) * width / (gt_x_max - gt_x_min +
_EPSILON) + 2
h_transform = (bb_y_max - bb_y_min) * height / (
gt_y_max - gt_y_min + _EPSILON)
w_transform = (bb_x_max - bb_x_min) * width / (
gt_x_max - gt_x_min + _EPSILON)
boundaries = tf.concat([
tf.cast(
tf.ones_like(y_transform) * ((height + 4) - 1), dtype=tf.float32),
tf.cast(
tf.ones_like(x_transform) * ((width + 4) - 1), dtype=tf.float32)
],
axis=-1)
# Reshape tensors to have the right shape for selective_crop_and_resize.
transformed_boxes = tf.concat(
[y_transform, x_transform, h_transform, w_transform], -1)
levels = tf.tile(
tf.reshape(tf.range(num_masks), [1, num_masks]), [batch_size, 1])
cropped_masks = selective_crop_and_resize(
masks,
transformed_boxes,
levels,
boundaries,
output_size,
sample_offset=sample_offset,
use_einsum_gather=use_einsum)
cropped_masks = tf.squeeze(cropped_masks, axis=-1)
return cropped_masks
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Target and sampling related ops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.legacy.detection.ops import spatial_transform_ops
from official.legacy.detection.utils import box_utils
from official.vision.utils.object_detection import balanced_positive_negative_sampler
def box_matching(boxes, gt_boxes, gt_classes):
"""Match boxes to groundtruth boxes.
Given the proposal boxes and the groundtruth boxes and classes, perform the
groundtruth matching by taking the argmax of the IoU between boxes and
groundtruth boxes.
Args:
boxes: a tensor of shape of [batch_size, N, 4] representing the box
coordinates to be matched to groundtruth boxes.
gt_boxes: a tensor of shape of [batch_size, MAX_INSTANCES, 4] representing
the groundtruth box coordinates. It is padded with -1s to indicate the
invalid boxes.
gt_classes: [batch_size, MAX_INSTANCES] representing the groundtruth box
classes. It is padded with -1s to indicate the invalid classes.
Returns:
matched_gt_boxes: a tensor of shape of [batch_size, N, 4], representing
the matched groundtruth box coordinates for each input box. If the box
does not overlap with any groundtruth boxes, the matched boxes of it
will be set to all 0s.
matched_gt_classes: a tensor of shape of [batch_size, N], representing
the matched groundtruth classes for each input box. If the box does not
overlap with any groundtruth boxes, the matched box classes of it will
be set to 0, which corresponds to the background class.
matched_gt_indices: a tensor of shape of [batch_size, N], representing
the indices of the matched groundtruth boxes in the original gt_boxes
tensor. If the box does not overlap with any groundtruth boxes, the
index of the matched groundtruth will be set to -1.
matched_iou: a tensor of shape of [batch_size, N], representing the IoU
between the box and its matched groundtruth box. The matched IoU is the
maximum IoU of the box and all the groundtruth boxes.
iou: a tensor of shape of [batch_size, N, K], representing the IoU matrix
between boxes and the groundtruth boxes. The IoU between a box and the
invalid groundtruth boxes whose coordinates are [-1, -1, -1, -1] is -1.
"""
# Compute IoU between boxes and gt_boxes.
# iou <- [batch_size, N, K]
iou = box_utils.bbox_overlap(boxes, gt_boxes)
# max_iou <- [batch_size, N]
# 0.0 -> no match to gt, or -1.0 match to no gt
matched_iou = tf.reduce_max(iou, axis=-1)
# background_box_mask <- bool, [batch_size, N]
background_box_mask = tf.less_equal(matched_iou, 0.0)
argmax_iou_indices = tf.argmax(iou, axis=-1, output_type=tf.int32)
argmax_iou_indices_shape = tf.shape(argmax_iou_indices)
batch_indices = (
tf.expand_dims(tf.range(argmax_iou_indices_shape[0]), axis=-1) *
tf.ones([1, argmax_iou_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, argmax_iou_indices], axis=-1)
matched_gt_boxes = tf.gather_nd(gt_boxes, gather_nd_indices)
matched_gt_boxes = tf.where(
tf.tile(tf.expand_dims(background_box_mask, axis=-1), [1, 1, 4]),
tf.zeros_like(matched_gt_boxes, dtype=matched_gt_boxes.dtype),
matched_gt_boxes)
matched_gt_classes = tf.gather_nd(gt_classes, gather_nd_indices)
matched_gt_classes = tf.where(background_box_mask,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(background_box_mask,
-tf.ones_like(argmax_iou_indices),
argmax_iou_indices)
return (matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
iou)
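# Worked example (illustrative): with one image, two proposals and a single
# groundtruth box,
#   boxes      = [[[0, 0, 10, 10], [20, 20, 30, 30]]]
#   gt_boxes   = [[[0, 0, 10, 10]]]
#   gt_classes = [[3]]
# the first proposal matches the groundtruth (IoU 1.0, class 3, index 0),
# while the second has no overlap and is therefore assigned an all-zero
# matched box, the background class 0 and a matched index of -1.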
def assign_and_sample_proposals(proposed_boxes,
gt_boxes,
gt_classes,
num_samples_per_image=512,
mix_gt_boxes=True,
fg_fraction=0.25,
fg_iou_thresh=0.5,
bg_iou_thresh_hi=0.5,
bg_iou_thresh_lo=0.0):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled image.
This tensor might have padding of values -1 indicating the invalid box
coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
num_samples_per_image: an integer representing the RoI minibatch size per
image.
mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes before
sampling proposals.
fg_fraction: a float representing the target fraction of the RoI minibatch
that is labeled foreground (i.e., class > 0).
fg_iou_thresh: a float representing the IoU overlap threshold for an RoI to
be considered foreground (if >= fg_iou_thresh).
bg_iou_thresh_hi: a float representing the IoU overlap threshold for an RoI
to be considered background (class = 0 if overlap in [LO, HI)).
bg_iou_thresh_lo: a float representing the IoU overlap threshold for an RoI
to be considered background (class = 0 if overlap in [LO, HI)).
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
indices of the sampled groundtruth boxes in the original `gt_boxes`
tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] = sampled_gt_boxes[:, i].
"""
with tf.name_scope('sample_proposals'):
if mix_gt_boxes:
boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
else:
boxes = proposed_boxes
(matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
_) = box_matching(boxes, gt_boxes, gt_classes)
positive_match = tf.greater(matched_iou, fg_iou_thresh)
negative_match = tf.logical_and(
tf.greater_equal(matched_iou, bg_iou_thresh_lo),
tf.less(matched_iou, bg_iou_thresh_hi))
ignored_match = tf.less(matched_iou, 0.0)
# re-assign negatively matched boxes to the background class.
matched_gt_classes = tf.where(negative_match,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(negative_match,
tf.zeros_like(matched_gt_indices),
matched_gt_indices)
sample_candidates = tf.logical_and(
tf.logical_or(positive_match, negative_match),
tf.logical_not(ignored_match))
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=fg_fraction, is_static=True))
batch_size, _ = sample_candidates.get_shape().as_list()
sampled_indicators = []
for i in range(batch_size):
sampled_indicator = sampler.subsample(sample_candidates[i],
num_samples_per_image,
positive_match[i])
sampled_indicators.append(sampled_indicator)
sampled_indicators = tf.stack(sampled_indicators)
_, sampled_indices = tf.nn.top_k(
tf.cast(sampled_indicators, dtype=tf.int32),
k=num_samples_per_image,
sorted=True)
sampled_indices_shape = tf.shape(sampled_indices)
batch_indices = (
tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)
sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
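# Illustrative usage sketch (argument values are assumptions): sampling a
# 512-RoI minibatch with a 25% foreground target during training.
#
#   sampled_rois, gt_box_targets, class_targets, gt_indices = (
#       assign_and_sample_proposals(
#           proposed_boxes, gt_boxes, gt_classes,
#           num_samples_per_image=512, mix_gt_boxes=True,
#           fg_fraction=0.25, fg_iou_thresh=0.5,
#           bg_iou_thresh_hi=0.5, bg_iou_thresh_lo=0.0))
#   # sampled_rois: [batch_size, 512, 4]; class_targets == 0 marks background.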
def sample_and_crop_foreground_masks(candidate_rois,
candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices,
gt_masks,
num_mask_samples_per_image=128,
mask_target_size=28):
"""Samples and creates cropped foreground masks for training.
Args:
candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
number of candidate RoIs to be considered for mask sampling. It includes
both positive and negative RoIs. The `num_mask_samples_per_image` positive
RoIs will be sampled to create mask training targets.
candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
corresponding groundtruth classes to the `candidate_rois`. 0 in the tensor
corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: a tensor of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i], where
gt_boxes, of shape [batch_size, MAX_INSTANCES, 4] with MAX_INSTANCES >= N,
is the superset of candidate_gt_boxes.
gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
containing all the groundtruth masks which sample masks are drawn from.
num_mask_samples_per_image: an integer which specifies the number of masks
to sample.
mask_target_size: an integer which specifies the final cropped mask size
after sampling. The output masks are resized w.r.t the sampled RoIs.
Returns:
foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: a tensor of shape of [batch_size, K] storing the classes
corresponding to the sampled foreground masks.
cropped_foreground_masks: a tensor of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the cropped
foreground masks used for training.
"""
with tf.name_scope('sample_and_crop_foreground_masks'):
_, fg_instance_indices = tf.nn.top_k(
tf.cast(tf.greater(candidate_gt_classes, 0), dtype=tf.int32),
k=num_mask_samples_per_image)
fg_instance_indices_shape = tf.shape(fg_instance_indices)
batch_indices = (
tf.expand_dims(tf.range(fg_instance_indices_shape[0]), axis=-1) *
tf.ones([1, fg_instance_indices_shape[-1]], dtype=tf.int32))
gather_nd_instance_indices = tf.stack([batch_indices, fg_instance_indices],
axis=-1)
foreground_rois = tf.gather_nd(candidate_rois, gather_nd_instance_indices)
foreground_boxes = tf.gather_nd(candidate_gt_boxes,
gather_nd_instance_indices)
foreground_classes = tf.gather_nd(candidate_gt_classes,
gather_nd_instance_indices)
foreground_gt_indices = tf.gather_nd(candidate_gt_indices,
gather_nd_instance_indices)
foreground_gt_indices_shape = tf.shape(foreground_gt_indices)
batch_indices = (
tf.expand_dims(tf.range(foreground_gt_indices_shape[0]), axis=-1) *
tf.ones([1, foreground_gt_indices_shape[-1]], dtype=tf.int32))
gather_nd_gt_indices = tf.stack([batch_indices, foreground_gt_indices],
axis=-1)
foreground_masks = tf.gather_nd(gt_masks, gather_nd_gt_indices)
cropped_foreground_masks = spatial_transform_ops.crop_mask_in_target_box(
foreground_masks,
foreground_boxes,
foreground_rois,
mask_target_size,
sample_offset=0.5)
return foreground_rois, foreground_classes, cropped_foreground_masks
class ROISampler(tf.keras.layers.Layer):
"""Samples RoIs and creates training targets."""
def __init__(self, params):
self._num_samples_per_image = params.num_samples_per_image
self._fg_fraction = params.fg_fraction
self._fg_iou_thresh = params.fg_iou_thresh
self._bg_iou_thresh_hi = params.bg_iou_thresh_hi
self._bg_iou_thresh_lo = params.bg_iou_thresh_lo
self._mix_gt_boxes = params.mix_gt_boxes
super(ROISampler, self).__init__(autocast=False)
def call(self, rois, gt_boxes, gt_classes):
"""Sample and assign RoIs for training.
Args:
rois: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
"""
sampled_rois, sampled_gt_boxes, sampled_gt_classes, sampled_gt_indices = (
assign_and_sample_proposals(
rois,
gt_boxes,
gt_classes,
num_samples_per_image=self._num_samples_per_image,
mix_gt_boxes=self._mix_gt_boxes,
fg_fraction=self._fg_fraction,
fg_iou_thresh=self._fg_iou_thresh,
bg_iou_thresh_hi=self._bg_iou_thresh_hi,
bg_iou_thresh_lo=self._bg_iou_thresh_lo))
return (sampled_rois, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices)
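# Illustrative usage sketch (`params` is a hypothetical config object carrying
# the attributes read in __init__): the layer is a thin wrapper around
# assign_and_sample_proposals.
#
#   sampler = ROISampler(params)
#   sampled_rois, gt_box_targets, class_targets, gt_indices = sampler(
#       rois, gt_boxes, gt_classes)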
class ROIScoreSampler(ROISampler):
"""Samples RoIs, RoI-scores and creates training targets."""
def __call__(self, rois, roi_scores, gt_boxes, gt_classes):
"""Sample and assign RoIs for training.
Args:
rois: a tensor of shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment. The last dimension is the box
coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax] format.
roi_scores: a tensor of shape of [batch_size, N], storing the RPN scores
of the proposals, which can be either classification or centerness
scores.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_roi_scores: a tensor of shape of [batch_size, K], representing the
confidence scores of the sampled RoIs.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
"""
(sampled_rois, sampled_roi_scores, sampled_gt_boxes, sampled_gt_classes,
sampled_gt_indices) = (
self.assign_and_sample_proposals_and_scores(
rois,
roi_scores,
gt_boxes,
gt_classes,
num_samples_per_image=self._num_samples_per_image,
mix_gt_boxes=self._mix_gt_boxes,
fg_fraction=self._fg_fraction,
fg_iou_thresh=self._fg_iou_thresh,
bg_iou_thresh_hi=self._bg_iou_thresh_hi,
bg_iou_thresh_lo=self._bg_iou_thresh_lo))
return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
sampled_gt_classes, sampled_gt_indices)
def assign_and_sample_proposals_and_scores(self,
proposed_boxes,
proposed_scores,
gt_boxes,
gt_classes,
num_samples_per_image=512,
mix_gt_boxes=True,
fg_fraction=0.25,
fg_iou_thresh=0.5,
bg_iou_thresh_hi=0.5,
bg_iou_thresh_lo=0.0):
"""Assigns the proposals with groundtruth classes and performs subsmpling.
Given `proposed_boxes`, `gt_boxes`, and `gt_classes`, the function uses the
following algorithm to generate the final `num_samples_per_image` RoIs.
1. Calculates the IoU between each proposal box and each gt_boxes.
2. Assigns each proposed box with a groundtruth class and box by choosing
the largest IoU overlap.
3. Samples `num_samples_per_image` boxes from all proposed boxes, and
returns box_targets, class_targets, and RoIs.
Args:
proposed_boxes: a tensor of shape of [batch_size, N, 4]. N is the number
of proposals before groundtruth assignment. The last dimension is the
box coordinates w.r.t. the scaled images in [ymin, xmin, ymax, xmax]
format.
proposed_scores: a tensor of shape of [batch_size, N]. N is the number of
proposals before groundtruth assignment. It is the rpn scores for all
proposed boxes which can be either their classification or centerness
scores.
gt_boxes: a tensor of shape of [batch_size, MAX_NUM_INSTANCES, 4]. The
coordinates of gt_boxes are in the pixel coordinates of the scaled
image. This tensor might have padding of values -1 indicating the
invalid box coordinates.
gt_classes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES]. This
tensor might have paddings with values of -1 indicating the invalid
classes.
num_samples_per_image: an integer representing the RoI minibatch size per
image.
mix_gt_boxes: a bool indicating whether to mix the groundtruth boxes
before sampling proposals.
fg_fraction: a float representing the target fraction of the RoI minibatch
that is labeled foreground (i.e., class > 0).
fg_iou_thresh: a float representing the IoU overlap threshold for an RoI
to be considered foreground (if >= fg_iou_thresh).
bg_iou_thresh_hi: a float representing the IoU overlap threshold for an
RoI to be considered background (class = 0 if overlap in [LO, HI)).
bg_iou_thresh_lo: a float representing the IoU overlap threshold for an
RoI to be considered background (class = 0 if overlap in [LO, HI)).
Returns:
sampled_rois: a tensor of shape of [batch_size, K, 4], representing the
coordinates of the sampled RoIs, where K is the number of the sampled
RoIs, i.e. K = num_samples_per_image.
sampled_roi_scores: a tensor of shape of [batch_size, K], representing the
confidence score of the sampled RoIs, where K is the number of the
sampled RoIs, i.e. K = num_samples_per_image.
sampled_gt_boxes: a tensor of shape of [batch_size, K, 4], storing the
box coordinates of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_classes: a tensor of shape of [batch_size, K], storing the
classes of the matched groundtruth boxes of the sampled RoIs.
sampled_gt_indices: a tensor of shape of [batch_size, K], storing the
indices of the sampled groundtruth boxes in the original `gt_boxes`
tensor, i.e. gt_boxes[sampled_gt_indices[:, i]] =
sampled_gt_boxes[:, i].
"""
with tf.name_scope('sample_proposals_and_scores'):
if mix_gt_boxes:
boxes = tf.concat([proposed_boxes, gt_boxes], axis=1)
gt_scores = tf.ones_like(gt_boxes[:, :, 0])
scores = tf.concat([proposed_scores, gt_scores], axis=1)
else:
boxes = proposed_boxes
scores = proposed_scores
(matched_gt_boxes, matched_gt_classes, matched_gt_indices, matched_iou,
_) = box_matching(boxes, gt_boxes, gt_classes)
positive_match = tf.greater(matched_iou, fg_iou_thresh)
negative_match = tf.logical_and(
tf.greater_equal(matched_iou, bg_iou_thresh_lo),
tf.less(matched_iou, bg_iou_thresh_hi))
ignored_match = tf.less(matched_iou, 0.0)
# re-assign negatively matched boxes to the background class.
matched_gt_classes = tf.where(negative_match,
tf.zeros_like(matched_gt_classes),
matched_gt_classes)
matched_gt_indices = tf.where(negative_match,
tf.zeros_like(matched_gt_indices),
matched_gt_indices)
sample_candidates = tf.logical_and(
tf.logical_or(positive_match, negative_match),
tf.logical_not(ignored_match))
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=fg_fraction, is_static=True))
batch_size, _ = sample_candidates.get_shape().as_list()
sampled_indicators = []
for i in range(batch_size):
sampled_indicator = sampler.subsample(sample_candidates[i],
num_samples_per_image,
positive_match[i])
sampled_indicators.append(sampled_indicator)
sampled_indicators = tf.stack(sampled_indicators)
_, sampled_indices = tf.nn.top_k(
tf.cast(sampled_indicators, dtype=tf.int32),
k=num_samples_per_image,
sorted=True)
sampled_indices_shape = tf.shape(sampled_indices)
batch_indices = (
tf.expand_dims(tf.range(sampled_indices_shape[0]), axis=-1) *
tf.ones([1, sampled_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, sampled_indices], axis=-1)
sampled_rois = tf.gather_nd(boxes, gather_nd_indices)
sampled_roi_scores = tf.gather_nd(scores, gather_nd_indices)
sampled_gt_boxes = tf.gather_nd(matched_gt_boxes, gather_nd_indices)
sampled_gt_classes = tf.gather_nd(matched_gt_classes, gather_nd_indices)
sampled_gt_indices = tf.gather_nd(matched_gt_indices, gather_nd_indices)
return (sampled_rois, sampled_roi_scores, sampled_gt_boxes,
sampled_gt_classes, sampled_gt_indices)
class MaskSampler(tf.keras.layers.Layer):
"""Samples and creates mask training targets."""
def __init__(self, mask_target_size, num_mask_samples_per_image):
self._mask_target_size = mask_target_size
self._num_mask_samples_per_image = num_mask_samples_per_image
super(MaskSampler, self).__init__(autocast=False)
def call(self,
candidate_rois,
candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices,
gt_masks):
"""Sample and create mask targets for training.
Args:
candidate_rois: a tensor of shape of [batch_size, N, 4], where N is the
number of candidate RoIs to be considered for mask sampling. It includes
both positive and negative RoIs. The `num_mask_samples_per_image`
positive RoIs will be sampled to create mask training targets.
candidate_gt_boxes: a tensor of shape of [batch_size, N, 4], storing the
corresponding groundtruth boxes to the `candidate_rois`.
candidate_gt_classes: a tensor of shape of [batch_size, N], storing the
corresponding groundtruth classes to the `candidate_rois`. 0 in the
tensor corresponds to the background class, i.e. negative RoIs.
candidate_gt_indices: a tensor of shape [batch_size, N], storing the
corresponding groundtruth instance indices to the `candidate_gt_boxes`,
i.e. gt_boxes[candidate_gt_indices[:, i]] = candidate_gt_boxes[:, i],
where gt_boxes, of shape [batch_size, MAX_INSTANCES, 4] with
MAX_INSTANCES >= N, is the superset of candidate_gt_boxes.
gt_masks: a tensor of [batch_size, MAX_INSTANCES, mask_height, mask_width]
containing all the groundtruth masks from which the sampled masks are
drawn. The output masks are cropped to the `mask_target_size` set at
construction and resized w.r.t. the sampled RoIs.
Returns:
foreground_rois: a tensor of shape of [batch_size, K, 4] storing the RoI
that corresponds to the sampled foreground masks, where
K = num_mask_samples_per_image.
foreground_classes: a tensor of shape of [batch_size, K] storing the
classes corresponding to the sampled foreground masks.
cropped_foreground_masks: a tensor of shape of
[batch_size, K, mask_target_size, mask_target_size] storing the
cropped foreground masks used for training.
"""
foreground_rois, foreground_classes, cropped_foreground_masks = (
sample_and_crop_foreground_masks(candidate_rois, candidate_gt_boxes,
candidate_gt_classes,
candidate_gt_indices, gt_masks,
self._num_mask_samples_per_image,
self._mask_target_size))
return foreground_rois, foreground_classes, cropped_foreground_masks
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for bounding box processing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
EPSILON = 1e-8
BBOX_XFORM_CLIP = np.log(1000. / 16.)
def visualize_images_with_bounding_boxes(images, box_outputs, step,
summary_writer):
"""Records subset of evaluation images with bounding boxes."""
image_shape = tf.shape(images[0])
image_height = tf.cast(image_shape[0], tf.float32)
image_width = tf.cast(image_shape[1], tf.float32)
normalized_boxes = normalize_boxes(box_outputs, [image_height, image_width])
bounding_box_color = tf.constant([[1.0, 1.0, 0.0, 1.0]])
image_summary = tf.image.draw_bounding_boxes(images, normalized_boxes,
bounding_box_color)
with summary_writer.as_default():
tf.summary.image('bounding_box_summary', image_summary, step=step)
summary_writer.flush()
def yxyx_to_xywh(boxes):
"""Converts boxes from ymin, xmin, ymax, xmax to xmin, ymin, width, height.
Args:
boxes: a numpy array whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
Returns:
boxes: a numpy array whose shape is the same as `boxes` in new format.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
boxes_ymin = boxes[..., 0]
boxes_xmin = boxes[..., 1]
boxes_width = boxes[..., 3] - boxes[..., 1]
boxes_height = boxes[..., 2] - boxes[..., 0]
new_boxes = np.stack([boxes_xmin, boxes_ymin, boxes_width, boxes_height],
axis=-1)
return new_boxes
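# Worked example (illustrative):
#
#   boxes = np.array([[10., 20., 50., 80.]])  # [ymin, xmin, ymax, xmax]
#   yxyx_to_xywh(boxes)
#   # -> array([[20., 10., 60., 40.]])        # [xmin, ymin, width, height]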
def jitter_boxes(boxes, noise_scale=0.025):
"""Jitter the box coordinates by some noise distribution.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
noise_scale: a python float which specifies the magnitude of noise. The rule
of thumb is to set this between (0, 0.1]. The default value is found to
mimic the noisy detections best empirically.
Returns:
jittered_boxes: a tensor whose shape is the same as `boxes` representing
the jittered boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('jitter_boxes'):
bbox_jitters = tf.random.normal(boxes.get_shape(), stddev=noise_scale)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
width = xmax - xmin
height = ymax - ymin
new_center_x = (xmin + xmax) / 2.0 + bbox_jitters[..., 0:1] * width
new_center_y = (ymin + ymax) / 2.0 + bbox_jitters[..., 1:2] * height
new_width = width * tf.math.exp(bbox_jitters[..., 2:3])
new_height = height * tf.math.exp(bbox_jitters[..., 3:4])
jittered_boxes = tf.concat([
new_center_y - new_height * 0.5, new_center_x - new_width * 0.5,
new_center_y + new_height * 0.5, new_center_x + new_width * 0.5
],
axis=-1)
return jittered_boxes
def normalize_boxes(boxes, image_shape):
"""Converts boxes to the normalized coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
normalized_boxes: a tensor whose shape is the same as `boxes` representing
the normalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('normalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0:1]
width = image_shape[..., 1:2]
ymin = boxes[..., 0:1] / height
xmin = boxes[..., 1:2] / width
ymax = boxes[..., 2:3] / height
xmax = boxes[..., 3:4] / width
normalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return normalized_boxes
def denormalize_boxes(boxes, image_shape):
"""Converts boxes normalized by [height, width] to pixel coordinates.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
denormalized_boxes: a tensor whose shape is the same as `boxes` representing
the denormalized boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
with tf.name_scope('denormalize_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.split(image_shape, 2, axis=-1)
ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
ymin = ymin * height
xmin = xmin * width
ymax = ymax * height
xmax = xmax * width
denormalized_boxes = tf.concat([ymin, xmin, ymax, xmax], axis=-1)
return denormalized_boxes
def clip_boxes(boxes, image_shape):
"""Clips boxes to image boundaries.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
Returns:
clipped_boxes: a tensor whose shape is the same as `boxes` representing the
clipped boxes.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('clip_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
max_length = [height - 1.0, width - 1.0, height - 1.0, width - 1.0]
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height, width = tf.unstack(image_shape, axis=-1)
max_length = tf.stack(
[height - 1.0, width - 1.0, height - 1.0, width - 1.0], axis=-1)
clipped_boxes = tf.math.maximum(tf.math.minimum(boxes, max_length), 0.0)
return clipped_boxes
def compute_outer_boxes(boxes, image_shape, scale=1.0):
"""Compute outer box encloses an object with a margin.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
image_shape: a list of two integers, a two-element vector or a tensor such
that all but the last dimensions are `broadcastable` to `boxes`. The last
dimension is 2, which represents [height, width].
scale: a float number specifying the scale of output outer boxes to input
`boxes`.
Returns:
outer_boxes: a tensor whose shape is the same as `boxes` representing the
outer boxes.
"""
  if scale < 1.0:
    raise ValueError(
        'scale is {}, but outer box scale must be no less than 1.0.'.format(
            scale))
centers_y = (boxes[..., 0] + boxes[..., 2]) / 2.0
centers_x = (boxes[..., 1] + boxes[..., 3]) / 2.0
box_height = (boxes[..., 2] - boxes[..., 0]) * scale
box_width = (boxes[..., 3] - boxes[..., 1]) * scale
outer_boxes = tf.stack([
centers_y - box_height / 2.0, centers_x - box_width / 2.0,
centers_y + box_height / 2.0, centers_x + box_width / 2.0
],
axis=1)
outer_boxes = clip_boxes(outer_boxes, image_shape)
return outer_boxes
def encode_boxes(boxes, anchors, weights=None):
"""Encode boxes to targets.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes: a tensor whose shape is the same as `boxes` representing the
encoded box targets.
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('encode_boxes'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
box_h = ymax - ymin + 1.0
box_w = xmax - xmin + 1.0
box_yc = ymin + 0.5 * box_h
box_xc = xmin + 0.5 * box_w
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin + 1.0
anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
encoded_dy = (box_yc - anchor_yc) / anchor_h
encoded_dx = (box_xc - anchor_xc) / anchor_w
encoded_dh = tf.math.log(box_h / anchor_h)
encoded_dw = tf.math.log(box_w / anchor_w)
if weights:
encoded_dy *= weights[0]
encoded_dx *= weights[1]
encoded_dh *= weights[2]
encoded_dw *= weights[3]
encoded_boxes = tf.concat([encoded_dy, encoded_dx, encoded_dh, encoded_dw],
axis=-1)
return encoded_boxes
def decode_boxes(encoded_boxes, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
    decoded_boxes: a tensor whose shape is the same as `encoded_boxes`
      representing the decoded box targets.
"""
if encoded_boxes.shape[-1] != 4:
raise ValueError('encoded_boxes.shape[-1] is {:d}, but must be 4.'.format(
encoded_boxes.shape[-1]))
with tf.name_scope('decode_boxes'):
encoded_boxes = tf.cast(encoded_boxes, dtype=anchors.dtype)
dy = encoded_boxes[..., 0:1]
dx = encoded_boxes[..., 1:2]
dh = encoded_boxes[..., 2:3]
dw = encoded_boxes[..., 3:4]
if weights:
dy /= weights[0]
dx /= weights[1]
dh /= weights[2]
dw /= weights[3]
dh = tf.math.minimum(dh, BBOX_XFORM_CLIP)
dw = tf.math.minimum(dw, BBOX_XFORM_CLIP)
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin + 1.0
anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
decoded_boxes_yc = dy * anchor_h + anchor_yc
decoded_boxes_xc = dx * anchor_w + anchor_xc
decoded_boxes_h = tf.math.exp(dh) * anchor_h
decoded_boxes_w = tf.math.exp(dw) * anchor_w
decoded_boxes_ymin = decoded_boxes_yc - 0.5 * decoded_boxes_h
decoded_boxes_xmin = decoded_boxes_xc - 0.5 * decoded_boxes_w
decoded_boxes_ymax = decoded_boxes_ymin + decoded_boxes_h - 1.0
decoded_boxes_xmax = decoded_boxes_xmin + decoded_boxes_w - 1.0
decoded_boxes = tf.concat([
decoded_boxes_ymin, decoded_boxes_xmin, decoded_boxes_ymax,
decoded_boxes_xmax
],
axis=-1)
return decoded_boxes
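# Illustrative sketch, not part of the original module: a round trip through
# `encode_boxes` and `decode_boxes` above. The helper name and sample values
# are assumptions made purely for demonstration.
def _example_encode_decode_boxes():
  boxes = tf.constant([[10.0, 10.0, 50.0, 90.0]])   # [N=1, 4] in yxyx order.
  anchors = tf.constant([[0.0, 0.0, 64.0, 64.0]])   # Matching anchors.
  weights = [10.0, 10.0, 5.0, 5.0]                  # Faster R-CNN style scaling.
  targets = encode_boxes(boxes, anchors, weights)
  recovered = decode_boxes(targets, anchors, weights)
  # `recovered` equals `boxes` up to floating-point error.
  return targets, recovered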
def encode_boxes_lrtb(boxes, anchors, weights=None):
"""Encode boxes to targets on lrtb (=left,right,top,bottom) format.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates
of boxes in ymin, xmin, ymax, xmax order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
encoded_boxes_lrtb: a tensor whose shape is the same as `boxes` representing
the encoded box targets. The box targets encode the left, right, top,
bottom distances from an anchor location to the four borders of the
matched groundtruth bounding box.
center_targets: centerness targets defined by the left, right, top, and
bottom distance targets. The centerness is defined as the deviation of the
anchor location from the groundtruth object center. Formally, centerness =
sqrt(min(left, right)/max(left, right)*min(top, bottom)/max(top, bottom)).
Raises:
ValueError: If the last dimension of boxes is not 4.
"""
if boxes.shape[-1] != 4:
raise ValueError(
'boxes.shape[-1] is {:d}, but must be 4.'.format(boxes.shape[-1]))
with tf.name_scope('encode_boxes_lrtb'):
boxes = tf.cast(boxes, dtype=anchors.dtype)
ymin = boxes[..., 0:1]
xmin = boxes[..., 1:2]
ymax = boxes[..., 2:3]
xmax = boxes[..., 3:4]
# box_h = ymax - ymin + 1.0
# box_w = xmax - xmin + 1.0
box_h = ymax - ymin
box_w = xmax - xmin
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
# anchor_h = anchor_ymax - anchor_ymin + 1.0
# anchor_w = anchor_xmax - anchor_xmin + 1.0
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
box_h += EPSILON
box_w += EPSILON
anchor_h += EPSILON
anchor_w += EPSILON
left = (anchor_xc - xmin) / anchor_w
right = (xmax - anchor_xc) / anchor_w
top = (anchor_yc - ymin) / anchor_h
bottom = (ymax - anchor_yc) / anchor_h
# Create centerness target. {
lrtb_targets = tf.concat([left, right, top, bottom], axis=-1)
valid_match = tf.greater(tf.reduce_min(lrtb_targets, -1), 0.0)
# Centerness score.
left_right = tf.concat([left, right], axis=-1)
left_right = tf.where(tf.stack([valid_match, valid_match], -1),
left_right, tf.zeros_like(left_right))
top_bottom = tf.concat([top, bottom], axis=-1)
top_bottom = tf.where(tf.stack([valid_match, valid_match], -1),
top_bottom, tf.zeros_like(top_bottom))
center_targets = tf.sqrt(
(tf.reduce_min(left_right, -1) /
(tf.reduce_max(left_right, -1) + EPSILON)) *
(tf.reduce_min(top_bottom, -1) /
(tf.reduce_max(top_bottom, -1) + EPSILON)))
center_targets = tf.where(valid_match,
center_targets,
tf.zeros_like(center_targets))
if weights:
left *= weights[0]
right *= weights[1]
top *= weights[2]
bottom *= weights[3]
encoded_boxes_lrtb = tf.concat(
[left, right, top, bottom],
axis=-1)
return encoded_boxes_lrtb, center_targets
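# Illustrative sketch, not part of the original module: the centerness target
# produced by `encode_boxes_lrtb` when the anchor center coincides with the
# groundtruth box center. The helper name and values are assumptions.
def _example_lrtb_centerness():
  boxes = tf.constant([[0.0, 0.0, 40.0, 80.0]])      # Box center at (20, 40).
  anchors = tf.constant([[10.0, 30.0, 30.0, 50.0]])  # Anchor center at (20, 40).
  lrtb_targets, centerness = encode_boxes_lrtb(boxes, anchors)
  # left == right and top == bottom here, so centerness is ~1.0; it decays
  # toward 0 as the anchor center moves toward a box border.
  return lrtb_targets, centerness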
def decode_boxes_lrtb(encoded_boxes_lrtb, anchors, weights=None):
"""Decode boxes.
Args:
encoded_boxes_lrtb: a tensor whose last dimension is 4 representing the
coordinates of encoded boxes in left, right, top, bottom order.
anchors: a tensor whose shape is the same as, or `broadcastable` to `boxes`,
representing the coordinates of anchors in ymin, xmin, ymax, xmax order.
weights: None or a list of four float numbers used to scale coordinates.
Returns:
    decoded_boxes_lrtb: a tensor whose shape is the same as `encoded_boxes_lrtb`
      representing the decoded boxes in ymin, xmin, ymax, xmax order, recovered
      from the left, right, top, and bottom distances between each anchor center
      and the four borders of the matched groundtruth bounding box.
"""
if encoded_boxes_lrtb.shape[-1] != 4:
raise ValueError(
'encoded_boxes_lrtb.shape[-1] is {:d}, but must be 4.'
.format(encoded_boxes_lrtb.shape[-1]))
with tf.name_scope('decode_boxes_lrtb'):
encoded_boxes_lrtb = tf.cast(encoded_boxes_lrtb, dtype=anchors.dtype)
left = encoded_boxes_lrtb[..., 0:1]
right = encoded_boxes_lrtb[..., 1:2]
top = encoded_boxes_lrtb[..., 2:3]
bottom = encoded_boxes_lrtb[..., 3:4]
if weights:
left /= weights[0]
right /= weights[1]
top /= weights[2]
bottom /= weights[3]
anchor_ymin = anchors[..., 0:1]
anchor_xmin = anchors[..., 1:2]
anchor_ymax = anchors[..., 2:3]
anchor_xmax = anchors[..., 3:4]
anchor_h = anchor_ymax - anchor_ymin
anchor_w = anchor_xmax - anchor_xmin
anchor_yc = anchor_ymin + 0.5 * anchor_h
anchor_xc = anchor_xmin + 0.5 * anchor_w
anchor_h += EPSILON
anchor_w += EPSILON
decoded_boxes_ymin = anchor_yc - top * anchor_h
decoded_boxes_xmin = anchor_xc - left * anchor_w
decoded_boxes_ymax = anchor_yc + bottom * anchor_h
decoded_boxes_xmax = anchor_xc + right * anchor_w
decoded_boxes_lrtb = tf.concat(
[decoded_boxes_ymin, decoded_boxes_xmin,
decoded_boxes_ymax, decoded_boxes_xmax],
axis=-1)
return decoded_boxes_lrtb
def filter_boxes(boxes, scores, image_shape, min_size_threshold):
"""Filter and remove boxes that are too small or fall outside the image.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
image_shape: a tensor whose shape is the same as, or `broadcastable` to
`boxes` except the last dimension, which is 2, representing [height,
width] of the scaled image.
min_size_threshold: a float representing the minimal box size in each side
(w.r.t. the scaled image). Boxes whose sides are smaller than it will be
filtered out.
Returns:
    filtered_boxes: a tensor whose shape is the same as `boxes` but with the
      positions of the filtered boxes filled with 0.
    filtered_scores: a tensor whose shape is the same as `scores` but with the
      positions of the filtered boxes filled with 0.
"""
if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes'):
if isinstance(image_shape, list) or isinstance(image_shape, tuple):
height, width = image_shape
else:
image_shape = tf.cast(image_shape, dtype=boxes.dtype)
height = image_shape[..., 0]
width = image_shape[..., 1]
ymin = boxes[..., 0]
xmin = boxes[..., 1]
ymax = boxes[..., 2]
xmax = boxes[..., 3]
h = ymax - ymin + 1.0
w = xmax - xmin + 1.0
yc = ymin + 0.5 * h
xc = xmin + 0.5 * w
min_size = tf.cast(
tf.math.maximum(min_size_threshold, 1.0), dtype=boxes.dtype)
filtered_size_mask = tf.math.logical_and(
tf.math.greater(h, min_size), tf.math.greater(w, min_size))
filtered_center_mask = tf.logical_and(
tf.math.logical_and(tf.math.greater(yc, 0.0), tf.math.less(yc, height)),
tf.math.logical_and(tf.math.greater(xc, 0.0), tf.math.less(xc, width)))
filtered_mask = tf.math.logical_and(filtered_size_mask,
filtered_center_mask)
filtered_scores = tf.where(filtered_mask, scores, tf.zeros_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
def filter_boxes_by_scores(boxes, scores, min_score_threshold):
"""Filter and remove boxes whose scores are smaller than the threshold.
Args:
boxes: a tensor whose last dimension is 4 representing the coordinates of
boxes in ymin, xmin, ymax, xmax order.
scores: a tensor whose shape is the same as tf.shape(boxes)[:-1]
representing the original scores of the boxes.
min_score_threshold: a float representing the minimal box score threshold.
Boxes whose score are smaller than it will be filtered out.
Returns:
    filtered_boxes: a tensor whose shape is the same as `boxes` but with the
      positions of the filtered boxes filled with 0.
    filtered_scores: a tensor whose shape is the same as `scores` but with the
      scores of the filtered boxes filled with -1.
"""
if boxes.shape[-1] != 4:
    raise ValueError('boxes.shape[-1] is {:d}, but must be 4.'.format(
boxes.shape[-1]))
with tf.name_scope('filter_boxes_by_scores'):
filtered_mask = tf.math.greater(scores, min_score_threshold)
filtered_scores = tf.where(filtered_mask, scores, -tf.ones_like(scores))
filtered_boxes = tf.cast(
tf.expand_dims(filtered_mask, axis=-1), dtype=boxes.dtype) * boxes
return filtered_boxes, filtered_scores
def top_k_boxes(boxes, scores, k):
"""Sort and select top k boxes according to the scores.
Args:
    boxes: a tensor of shape [batch_size, N, 4] representing the coordinates of
      the boxes. N is the number of boxes per image.
    scores: a tensor of shape [batch_size, N] representing the scores of the
      boxes.
k: an integer or a tensor indicating the top k number.
Returns:
selected_boxes: a tensor of shape [batch_size, k, 4] representing the
selected top k box coordinates.
selected_scores: a tensor of shape [batch_size, k] representing the selected
top k box scores.
"""
with tf.name_scope('top_k_boxes'):
selected_scores, top_k_indices = tf.nn.top_k(scores, k=k, sorted=True)
batch_size, _ = scores.get_shape().as_list()
if batch_size == 1:
selected_boxes = tf.squeeze(
tf.gather(boxes, top_k_indices, axis=1), axis=1)
else:
top_k_indices_shape = tf.shape(top_k_indices)
batch_indices = (
tf.expand_dims(tf.range(top_k_indices_shape[0]), axis=-1) *
tf.ones([1, top_k_indices_shape[-1]], dtype=tf.int32))
gather_nd_indices = tf.stack([batch_indices, top_k_indices], axis=-1)
selected_boxes = tf.gather_nd(boxes, gather_nd_indices)
return selected_boxes, selected_scores
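# Illustrative sketch, not part of the original module: keep the two highest
# scoring boxes per image with `top_k_boxes`. Names and values are assumptions.
def _example_top_k_boxes():
  boxes = tf.constant([[[0.0, 0.0, 1.0, 1.0],
                        [0.0, 0.0, 2.0, 2.0],
                        [0.0, 0.0, 3.0, 3.0]]])     # [batch=1, N=3, 4].
  scores = tf.constant([[0.1, 0.9, 0.5]])           # [batch=1, N=3].
  selected_boxes, selected_scores = top_k_boxes(boxes, scores, k=2)
  # selected_scores == [[0.9, 0.5]] and selected_boxes keeps rows 1 and 2.
  return selected_boxes, selected_scores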
def bbox_overlap(boxes, gt_boxes):
"""Calculates the overlap between proposal and ground truth boxes.
Some `gt_boxes` may have been padded. The returned `iou` tensor for these
boxes will be -1.
Args:
boxes: a tensor with a shape of [batch_size, N, 4]. N is the number of
proposals before groundtruth assignment (e.g., rpn_post_nms_topn). The
last dimension is the pixel coordinates in [ymin, xmin, ymax, xmax] form.
gt_boxes: a tensor with a shape of [batch_size, MAX_NUM_INSTANCES, 4]. This
tensor might have paddings with a negative value.
Returns:
    iou: a tensor with a shape of [batch_size, N, MAX_NUM_INSTANCES].
"""
with tf.name_scope('bbox_overlap'):
bb_y_min, bb_x_min, bb_y_max, bb_x_max = tf.split(
value=boxes, num_or_size_splits=4, axis=2)
gt_y_min, gt_x_min, gt_y_max, gt_x_max = tf.split(
value=gt_boxes, num_or_size_splits=4, axis=2)
# Calculates the intersection area.
i_xmin = tf.math.maximum(bb_x_min, tf.transpose(gt_x_min, [0, 2, 1]))
i_xmax = tf.math.minimum(bb_x_max, tf.transpose(gt_x_max, [0, 2, 1]))
i_ymin = tf.math.maximum(bb_y_min, tf.transpose(gt_y_min, [0, 2, 1]))
i_ymax = tf.math.minimum(bb_y_max, tf.transpose(gt_y_max, [0, 2, 1]))
i_area = tf.math.maximum((i_xmax - i_xmin), 0) * tf.math.maximum(
(i_ymax - i_ymin), 0)
# Calculates the union area.
bb_area = (bb_y_max - bb_y_min) * (bb_x_max - bb_x_min)
gt_area = (gt_y_max - gt_y_min) * (gt_x_max - gt_x_min)
# Adds a small epsilon to avoid divide-by-zero.
u_area = bb_area + tf.transpose(gt_area, [0, 2, 1]) - i_area + 1e-8
# Calculates IoU.
iou = i_area / u_area
# Fills -1 for IoU entries between the padded ground truth boxes.
gt_invalid_mask = tf.less(
tf.reduce_max(gt_boxes, axis=-1, keepdims=True), 0.0)
padding_mask = tf.logical_or(
tf.zeros_like(bb_x_min, dtype=tf.bool),
tf.transpose(gt_invalid_mask, [0, 2, 1]))
iou = tf.where(padding_mask, -tf.ones_like(iou), iou)
return iou
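# Illustrative sketch, not part of the original module: IoU between a single
# proposal and two groundtruth boxes, the second of which is padding (all -1)
# and therefore gets an IoU of -1. Names and values are assumptions.
def _example_bbox_overlap():
  boxes = tf.constant([[[0.0, 0.0, 10.0, 10.0]]])        # [batch=1, N=1, 4].
  gt_boxes = tf.constant([[[0.0, 0.0, 10.0, 20.0],
                           [-1.0, -1.0, -1.0, -1.0]]])   # [batch=1, M=2, 4].
  iou = bbox_overlap(boxes, gt_boxes)
  # iou is approximately [[[0.5, -1.0]]].
  return iou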
def get_non_empty_box_indices(boxes):
"""Get indices for non-empty boxes."""
  # Selects indices where both box height and width are greater than 0.
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
indices = tf.where(
tf.logical_and(tf.greater(height, 0), tf.greater(width, 0)))
return indices[:, 0]
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for handling dataset object categories."""
def coco_split_class_ids(split_name):
"""Return the COCO class split ids based on split name and training mode.
Args:
split_name: The name of dataset split.
Returns:
    class_ids: a python list of integers.
"""
if split_name == 'all':
return []
elif split_name == 'voc':
return [
1, 2, 3, 4, 5, 6, 7, 9, 16, 17, 18, 19, 20, 21, 44, 62, 63, 64, 67, 72
]
elif split_name == 'nonvoc':
return [
8, 10, 11, 13, 14, 15, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36,
37, 38, 39, 40, 41, 42, 43, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
57, 58, 59, 60, 61, 65, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84,
85, 86, 87, 88, 89, 90
]
else:
raise ValueError('Invalid split name {}!!!'.format(split_name))
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for dataloader."""
import tensorflow as tf
from official.legacy.detection.utils import input_utils
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
pred=tf.equal(tf.size(input=source_id), 0),
true_fn=lambda: tf.cast(tf.constant(-1), tf.int64),
false_fn=lambda: tf.identity(source_id))
return source_id
def pad_groundtruths_to_fixed_size(gt, n):
"""Pads the first dimension of groundtruths labels to the fixed size."""
gt['boxes'] = input_utils.pad_to_fixed_size(gt['boxes'], n, -1)
gt['is_crowds'] = input_utils.pad_to_fixed_size(gt['is_crowds'], n, 0)
gt['areas'] = input_utils.pad_to_fixed_size(gt['areas'], n, -1)
gt['classes'] = input_utils.pad_to_fixed_size(gt['classes'], n, -1)
return gt
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for input processing."""
import math
import tensorflow as tf
from official.legacy.detection.utils import box_utils
from official.vision.utils.object_detection import preprocessor
def pad_to_fixed_size(input_tensor, size, constant_values=0):
"""Pads data to a fixed length at the first dimension.
Args:
input_tensor: `Tensor` with any dimension.
size: `int` number for the first dimension of output Tensor.
constant_values: `int` value assigned to the paddings.
Returns:
`Tensor` with the first dimension padded to `size`.
"""
input_shape = input_tensor.get_shape().as_list()
padding_shape = []
# Computes the padding length on the first dimension.
padding_length = tf.maximum(0, size - tf.shape(input_tensor)[0])
assert_length = tf.Assert(
tf.greater_equal(padding_length, 0), [padding_length])
with tf.control_dependencies([assert_length]):
padding_shape.append(padding_length)
# Copies shapes of the rest of input shape dimensions.
for i in range(1, len(input_shape)):
padding_shape.append(tf.shape(input=input_tensor)[i])
# Pads input tensor to the fixed first dimension.
paddings = tf.cast(constant_values * tf.ones(padding_shape),
input_tensor.dtype)
padded_tensor = tf.concat([input_tensor, paddings], axis=0)
output_shape = input_shape
output_shape[0] = size
padded_tensor.set_shape(output_shape)
return padded_tensor
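# Illustrative sketch, not part of the original module: pad a variable number
# of boxes to a fixed first dimension, as pad_groundtruths_to_fixed_size in the
# dataloader utilities does. The helper name and values are assumptions.
def _example_pad_to_fixed_size():
  boxes = tf.constant([[0.0, 0.0, 5.0, 5.0],
                       [1.0, 2.0, 3.0, 4.0]])       # Two boxes.
  padded = pad_to_fixed_size(boxes, size=10, constant_values=-1)
  # `padded` has shape [10, 4]; rows 2..9 are filled with -1.
  return padded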
def normalize_image(image,
offset=(0.485, 0.456, 0.406),
scale=(0.229, 0.224, 0.225)):
"""Normalizes the image to zero mean and unit variance."""
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
offset = tf.constant(offset)
offset = tf.expand_dims(offset, axis=0)
offset = tf.expand_dims(offset, axis=0)
image -= offset
scale = tf.constant(scale)
scale = tf.expand_dims(scale, axis=0)
scale = tf.expand_dims(scale, axis=0)
image /= scale
return image
def compute_padded_size(desired_size, stride):
"""Compute the padded size given the desired size and the stride.
  The padded size will be the smallest rectangle, such that each dimension is
  the smallest multiple of the stride that is no smaller than the desired
  dimension. For example, if desired_size = (100, 200) and stride = 32,
the output padded_size = (128, 224).
Args:
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the target output image size.
stride: an integer, the stride of the backbone network.
Returns:
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size.
"""
if isinstance(desired_size, list) or isinstance(desired_size, tuple):
padded_size = [
int(math.ceil(d * 1.0 / stride) * stride) for d in desired_size
]
else:
padded_size = tf.cast(
tf.math.ceil(tf.cast(desired_size, dtype=tf.float32) / stride) * stride,
tf.int32)
return padded_size
def resize_and_crop_image(image,
desired_size,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size.
Resize and pad images given the desired output size of the image and
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and rescale the image to make it
the largest rectangle to be bounded by the rectangle specified by the
`desired_size`.
2. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
desired_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the desired actual output image size.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
      desired_width] is the actual scaled image size, and [y_scale, x_scale] is
      the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image'):
image_size = tf.cast(tf.shape(input=image)[0:2], tf.float32)
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform([],
aug_scale_min,
aug_scale_max,
seed=seed)
scaled_size = tf.round(random_scale * desired_size)
else:
scaled_size = desired_size
scale = tf.minimum(scaled_size[0] / image_size[0],
scaled_size[1] / image_size[1])
scaled_size = tf.round(image_size * scale)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([
2,
], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32), image_scale,
tf.cast(offset, tf.float32)
])
return output_image, image_info
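# Illustrative sketch, not part of the original module: resize a 480x640 image
# for a 512x512 input with a stride-32 backbone and no scale jittering.
# The helper name and values are assumptions.
def _example_resize_and_crop_image():
  image = tf.zeros([480, 640, 3], tf.float32)
  desired_size = tf.constant([512.0, 512.0])
  padded_size = compute_padded_size([512, 512], stride=32)   # -> [512, 512].
  output_image, image_info = resize_and_crop_image(
      image, desired_size, padded_size)
  # output_image has shape [512, 512, 3]; image_info[2] holds [y_scale, x_scale].
  return output_image, image_info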
def resize_and_crop_image_v2(image,
short_side,
long_side,
padded_size,
aug_scale_min=1.0,
aug_scale_max=1.0,
seed=1,
method=tf.image.ResizeMethod.BILINEAR):
"""Resizes the input image to output size (Faster R-CNN style).
Resize and pad images given the specified short / long side length and the
stride size.
Here are the preprocessing steps.
1. For a given image, keep its aspect ratio and first try to rescale the short
side of the original image to `short_side`.
  2. If the scaled image after step 1 has a long side that exceeds `long_side`,
     keep the aspect ratio and rescale the long side of the image to `long_side`.
  3. Pad the rescaled image to the padded_size.
Args:
image: a `Tensor` of shape [height, width, 3] representing an image.
short_side: a scalar `Tensor` or `int` representing the desired short side
to be rescaled to.
long_side: a scalar `Tensor` or `int` representing the desired long side to
be rescaled to.
padded_size: a `Tensor` or `int` list/tuple of two elements representing
[height, width] of the padded output image size. Padding will be applied
after scaling the image to the desired_size.
aug_scale_min: a `float` with range between [0, 1.0] representing minimum
random scale applied to desired_size for training scale jittering.
aug_scale_max: a `float` with range between [1.0, inf] representing maximum
random scale applied to desired_size for training scale jittering.
seed: seed for random scale jittering.
method: function to resize input image to scaled image.
Returns:
    output_image: `Tensor` of shape [height, width, 3] where [height, width]
      equals `padded_size`.
image_info: a 2D `Tensor` that encodes the information of the image and the
applied preprocessing. It is in the format of
[[original_height, original_width], [desired_height, desired_width],
[y_scale, x_scale], [y_offset, x_offset]], where [desired_height,
desired_width] is the actual scaled image size, and [y_scale, x_scale] is
the scaling factor, which is the ratio of
scaled dimension / original dimension.
"""
with tf.name_scope('resize_and_crop_image_v2'):
image_size = tf.cast(tf.shape(image)[0:2], tf.float32)
scale_using_short_side = (
short_side / tf.math.minimum(image_size[0], image_size[1]))
scale_using_long_side = (
long_side / tf.math.maximum(image_size[0], image_size[1]))
scaled_size = tf.math.round(image_size * scale_using_short_side)
scaled_size = tf.where(
tf.math.greater(
tf.math.maximum(scaled_size[0], scaled_size[1]), long_side),
tf.math.round(image_size * scale_using_long_side), scaled_size)
desired_size = scaled_size
random_jittering = (aug_scale_min != 1.0 or aug_scale_max != 1.0)
if random_jittering:
random_scale = tf.random.uniform([],
aug_scale_min,
aug_scale_max,
seed=seed)
scaled_size = tf.math.round(random_scale * scaled_size)
# Computes 2D image_scale.
image_scale = scaled_size / image_size
# Selects non-zero random offset (x, y) if scaled image is larger than
# desired_size.
if random_jittering:
max_offset = scaled_size - desired_size
max_offset = tf.where(
tf.math.less(max_offset, 0), tf.zeros_like(max_offset), max_offset)
offset = max_offset * tf.random.uniform([
2,
], 0, 1, seed=seed)
offset = tf.cast(offset, tf.int32)
else:
offset = tf.zeros((2,), tf.int32)
scaled_image = tf.image.resize(
image, tf.cast(scaled_size, tf.int32), method=method)
if random_jittering:
scaled_image = scaled_image[offset[0]:offset[0] + desired_size[0],
offset[1]:offset[1] + desired_size[1], :]
output_image = tf.image.pad_to_bounding_box(scaled_image, 0, 0,
padded_size[0], padded_size[1])
image_info = tf.stack([
image_size,
tf.cast(desired_size, dtype=tf.float32), image_scale,
tf.cast(offset, tf.float32)
])
return output_image, image_info
def resize_and_crop_boxes(boxes, image_scale, output_size, offset):
"""Resizes boxes to output size with scale and offset.
Args:
boxes: `Tensor` of shape [N, 4] representing ground truth boxes.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
boxes.
Returns:
boxes: `Tensor` of shape [N, 4] representing the scaled boxes.
"""
# Adjusts box coordinates based on image_scale and offset.
boxes *= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
boxes -= tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
# Clips the boxes.
boxes = box_utils.clip_boxes(boxes, output_size)
return boxes
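# Illustrative sketch, not part of the original module: rescale groundtruth
# boxes with the scale and offset that resize_and_crop_image reports in
# `image_info`. The helper name and values are assumptions.
def _example_resize_and_crop_boxes():
  boxes = tf.constant([[10.0, 20.0, 110.0, 220.0]])   # One box in yxyx order.
  image_scale = tf.constant([0.5, 0.5])               # image_info[2].
  output_size = tf.constant([512.0, 512.0])
  offset = tf.constant([0.0, 0.0])                    # image_info[3].
  scaled_boxes = resize_and_crop_boxes(boxes, image_scale, output_size, offset)
  # scaled_boxes == [[5.0, 10.0, 55.0, 110.0]] after clipping.
  return scaled_boxes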
def resize_and_crop_masks(masks, image_scale, output_size, offset):
"""Resizes boxes to output size with scale and offset.
Args:
masks: `Tensor` of shape [N, H, W, 1] representing ground truth masks.
image_scale: 2D float `Tensor` representing scale factors that apply to
[height, width] of input image.
output_size: 2D `Tensor` or `int` representing [height, width] of target
output image size.
offset: 2D `Tensor` representing top-left corner [y0, x0] to crop scaled
      masks.
Returns:
masks: `Tensor` of shape [N, H, W, 1] representing the scaled masks.
"""
mask_size = tf.shape(input=masks)[1:3]
scaled_size = tf.cast(image_scale * tf.cast(mask_size, image_scale.dtype),
tf.int32)
scaled_masks = tf.image.resize(
masks, scaled_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
offset = tf.cast(offset, tf.int32)
scaled_masks = scaled_masks[:, offset[0]:offset[0] + output_size[0],
offset[1]:offset[1] + output_size[1], :]
output_masks = tf.image.pad_to_bounding_box(scaled_masks, 0, 0,
output_size[0], output_size[1])
return output_masks
def random_horizontal_flip(image, boxes=None, masks=None):
"""Randomly flips input image and bounding boxes."""
return preprocessor.random_horizontal_flip(image, boxes, masks)
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for segmentations."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import cv2
import numpy as np
def paste_instance_masks(masks, detected_boxes, image_height, image_width):
"""Paste instance masks to generate the image segmentation results.
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
_, mask_height, mask_width = masks.shape
scale = max((mask_width + 2.0) / mask_width,
(mask_height + 2.0) / mask_height)
ref_boxes = expand_boxes(detected_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
# Process mask inside bounding boxes.
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
x_0 = min(max(ref_box[0], 0), image_width)
x_1 = min(max(ref_box[2] + 1, 0), image_width)
y_0 = min(max(ref_box[1], 0), image_height)
y_1 = min(max(ref_box[3] + 1, 0), image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]),
(x_0 - ref_box[0]):(x_1 - ref_box[0])]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
def paste_instance_masks_v2(masks, detected_boxes, image_height, image_width):
"""Paste instance masks to generate the image segmentation (v2).
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
_, mask_height, mask_width = masks.shape
segms = []
for i, mask in enumerate(masks):
box = detected_boxes[i, :]
xmin = box[0]
ymin = box[1]
xmax = xmin + box[2]
ymax = ymin + box[3]
# Sample points of the cropped mask w.r.t. the image grid.
# Note that these coordinates may fall beyond the image.
# Pixel clipping will happen after warping.
xmin_int = int(math.floor(xmin))
xmax_int = int(math.ceil(xmax))
ymin_int = int(math.floor(ymin))
ymax_int = int(math.ceil(ymax))
alpha = box[2] / (1.0 * mask_width)
beta = box[3] / (1.0 * mask_height)
# pylint: disable=invalid-name
# Transformation from mask pixel indices to image coordinate.
M_mask_to_image = np.array([[alpha, 0, xmin], [0, beta, ymin], [0, 0, 1]],
dtype=np.float32)
# Transformation from image to cropped mask coordinate.
M_image_to_crop = np.array(
[[1, 0, -xmin_int], [0, 1, -ymin_int], [0, 0, 1]], dtype=np.float32)
M = np.dot(M_image_to_crop, M_mask_to_image)
# Compensate the half pixel offset that OpenCV has in the
# warpPerspective implementation: the top-left pixel is sampled
# at (0,0), but we want it to be at (0.5, 0.5).
M = np.dot(
np.dot(
np.array([[1, 0, -0.5], [0, 1, -0.5], [0, 0, 1]], np.float32), M),
np.array([[1, 0, 0.5], [0, 1, 0.5], [0, 0, 1]], np.float32))
# pylint: enable=invalid-name
cropped_mask = cv2.warpPerspective(
mask.astype(np.float32), M, (xmax_int - xmin_int, ymax_int - ymin_int))
cropped_mask = np.array(cropped_mask > 0.5, dtype=np.uint8)
img_mask = np.zeros((image_height, image_width))
x0 = max(min(xmin_int, image_width), 0)
x1 = max(min(xmax_int, image_width), 0)
y0 = max(min(ymin_int, image_height), 0)
y1 = max(min(ymax_int, image_height), 0)
img_mask[y0:y1, x0:x1] = cropped_mask[(y0 - ymin_int):(y1 - ymin_int),
(x0 - xmin_int):(x1 - xmin_int)]
segms.append(img_mask)
segms = np.array(segms)
return segms
# Image Classification
**Warning:** the features in the `image_classification/` directory have been
fully integrated into the [new code base](https://github.com/tensorflow/models/tree/benchmark/official/vision/modeling/backbones).
This folder contains TF 2 model examples for image classification:
* [MNIST](#mnist)
* [Classifier Trainer](#classifier-trainer), a framework that uses the Keras
compile/fit methods for image classification models, including:
* ResNet
* EfficientNet[^1]
[^1]: Currently a work in progress. We cannot match "AutoAugment (AA)" in [the original version](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet).
For more information about other types of models, please refer to this
[README file](../../README.md).
## Before you begin
Please make sure that you have the latest version of TensorFlow
installed and add the models folder to your Python path.
### ImageNet preparation
#### Using TFDS
`classifier_trainer.py` supports ImageNet with
[TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview).
Please see the following [example snippet](https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/scripts/download_and_prepare.py)
for more information on how to use TFDS to download and prepare datasets, and
specifically the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md)
for manual download instructions.
#### Legacy TFRecords
Download the ImageNet dataset and convert it to TFRecord format.
The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
provide a few options.
Note that the legacy ResNet runners, e.g. [resnet/resnet_ctl_imagenet_main.py](resnet/resnet_ctl_imagenet_main.py),
require TFRecords, whereas `classifier_trainer.py` can use either format by setting the
builder to 'records' or 'tfds' in the configurations.
### Running on Cloud TPUs
Note: These models will **not** work with TPUs on Colab.
You can train image classification models on Cloud TPUs using
[tf.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf.distribute.TPUStrategy?version=nightly).
If you are not familiar with Cloud TPUs, it is strongly recommended that you go
through the
[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
create a TPU and GCE VM.
### Running on multiple GPU hosts
You can also train these models on multiple hosts, each with GPUs, using
[tf.distribute.experimental.MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy).
The easiest way to run multi-host benchmarks is to set the
[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
appropriately at each host. e.g., to run using `MultiWorkerMirroredStrategy` on
2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
available GPUs at each host.
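For example, a minimal `TF_CONFIG` for the first of two workers might look like
the sketch below; the host names and port are placeholders, and the snippet is
illustrative rather than part of the training scripts. Set `"index": 1` on the
second host.

```python
import json
import os

# Hypothetical two-host cluster; replace the host:port entries with your own.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {'worker': ['host1.example.com:2222', 'host2.example.com:2222']},
    'task': {'type': 'worker', 'index': 0},
})
```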
## MNIST
To download the data and run the MNIST sample model locally for the first time,
run one of the following commands:
<details>
```bash
python3 mnist_main.py \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--train_epochs=10 \
--distribution_strategy=one_device \
--num_gpus=$NUM_GPUS \
--download
```
</details>
To train the model on a Cloud TPU, run the following command:
<details>
```bash
python3 mnist_main.py \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--train_epochs=10 \
--distribution_strategy=tpu \
--download
```
</details>
Note: the `--download` flag is only required the first time you run the model.
## Classifier Trainer
The classifier trainer is a unified framework for running image classification
models using Keras's compile/fit methods. Experiments should be provided in the
form of YAML files; some examples are included in the configs/examples folder.
Please see [configs/examples](./configs/examples) for more example
configurations.
The provided configuration files use a per-replica batch size, which is scaled
by the number of devices. For instance, if `batch size` = 64, then for 1 GPU
the global batch size would be 64 * 1 = 64. For 8 GPUs, the global batch size
would be 64 * 8 = 512. Similarly, for a v3-8 TPU, the global batch size would
be 64 * 8 = 512, and for a v3-32, the global batch size is 64 * 32 = 2048.
### ResNet50
#### On GPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=resnet \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/resnet/imagenet/gpu.yaml \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
</details>
To train on multiple hosts, each with GPUs attached, using
[MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy),
please update the `runtime` section in gpu.yaml
(or override it using `--params_override`) with:
<details>
```YAML
# gpu.yaml
runtime:
distribution_strategy: 'multi_worker_mirrored'
worker_hosts: '$HOST1:port,$HOST2:port'
num_gpus: $NUM_GPUS
task_index: 0
```
</details>
Set `task_index: 0` on the first host, `task_index: 1` on the second,
and so on. `$HOST1` and `$HOST2` are the IP addresses of the hosts, and `port`
can be any free port on the hosts. Only the first host will write
TensorBoard summaries and save checkpoints.
#### On TPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=resnet \
--dataset=imagenet \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/resnet/imagenet/tpu.yaml
```
</details>
### VGG-16
#### On GPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=vgg \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/vgg/imagenet/gpu.yaml \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
</details>
### EfficientNet
**Note: EfficientNet development is a work in progress.**
#### On GPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=efficientnet \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
</details>
#### On TPU:
<details>
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=efficientnet \
--dataset=imagenet \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
```
</details>
Note that the number of GPU devices can be overridden in the command line using
`--params_override`. The TPU does not need this override, as the device is fixed
by providing the TPU address or name with the `--tpu` flag.
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""AutoAugment and RandAugment policies for enhanced image preprocessing.
AutoAugment Reference: https://arxiv.org/abs/1805.09501
RandAugment Reference: https://arxiv.org/abs/1909.13719
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
from typing import Any, Dict, List, Optional, Text, Tuple
import tensorflow as tf
# This signifies the max integer that the controller RNN could predict for the
# augmentation scheme.
_MAX_LEVEL = 10.
def to_4d(image: tf.Tensor) -> tf.Tensor:
"""Converts an input Tensor to 4 dimensions.
4D image => [N, H, W, C] or [N, C, H, W]
3D image => [1, H, W, C] or [1, C, H, W]
2D image => [1, H, W, 1]
Args:
image: The 2/3/4D input tensor.
Returns:
A 4D image tensor.
Raises:
`TypeError` if `image` is not a 2/3/4D tensor.
"""
shape = tf.shape(image)
original_rank = tf.rank(image)
left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32)
right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32)
new_shape = tf.concat(
[
tf.ones(shape=left_pad, dtype=tf.int32),
shape,
tf.ones(shape=right_pad, dtype=tf.int32),
],
axis=0,
)
return tf.reshape(image, new_shape)
def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor:
"""Converts a 4D image back to `ndims` rank."""
shape = tf.shape(image)
begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32)
end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32)
new_shape = shape[begin:end]
return tf.reshape(image, new_shape)
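# Illustrative sketch, not part of the original module: to_4d/from_4d round
# trip for a rank-3 image. The helper name is an assumption.
def _example_to_from_4d():
  image = tf.zeros([224, 224, 3], tf.uint8)
  image_4d = to_4d(image)                         # Shape [1, 224, 224, 3].
  restored = from_4d(image_4d, tf.rank(image))    # Shape [224, 224, 3] again.
  return restored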
def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor:
"""Converts translations to a projective transform.
The translation matrix looks like this:
[[1 0 -dx]
[0 1 -dy]
[0 0 1]]
Args:
translations: The 2-element list representing [dx, dy], or a matrix of
2-element lists representing [dx dy] to translate for each image. The
shape must be static.
Returns:
The transformation matrix of shape (num_images, 8).
Raises:
`TypeError` if
- the shape of `translations` is not known or
- the shape of `translations` is not rank 1 or 2.
"""
translations = tf.convert_to_tensor(translations, dtype=tf.float32)
if translations.get_shape().ndims is None:
raise TypeError('translations rank must be statically known')
elif len(translations.get_shape()) == 1:
translations = translations[None]
elif len(translations.get_shape()) != 2:
raise TypeError('translations should have rank 1 or 2.')
num_translations = tf.shape(translations)[0]
return tf.concat(
values=[
tf.ones((num_translations, 1), tf.dtypes.float32),
tf.zeros((num_translations, 1), tf.dtypes.float32),
-translations[:, 0, None],
tf.zeros((num_translations, 1), tf.dtypes.float32),
tf.ones((num_translations, 1), tf.dtypes.float32),
-translations[:, 1, None],
tf.zeros((num_translations, 2), tf.dtypes.float32),
],
axis=1,
)
def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor,
image_height: tf.Tensor) -> tf.Tensor:
"""Converts an angle or angles to a projective transform.
Args:
    angles: A scalar to rotate all images, or a vector to rotate a batch of
      images. The rank must be statically known.
image_width: The width of the image(s) to be transformed.
image_height: The height of the image(s) to be transformed.
Returns:
A tensor of shape (num_images, 8).
Raises:
`TypeError` if `angles` is not rank 0 or 1.
"""
angles = tf.convert_to_tensor(angles, dtype=tf.float32)
if len(angles.get_shape()) == 0: # pylint:disable=g-explicit-length-test
angles = angles[None]
elif len(angles.get_shape()) != 1:
raise TypeError('Angles should have a rank 0 or 1.')
x_offset = ((image_width - 1) -
(tf.math.cos(angles) * (image_width - 1) - tf.math.sin(angles) *
(image_height - 1))) / 2.0
y_offset = ((image_height - 1) -
(tf.math.sin(angles) * (image_width - 1) + tf.math.cos(angles) *
(image_height - 1))) / 2.0
num_angles = tf.shape(angles)[0]
return tf.concat(
values=[
tf.math.cos(angles)[:, None],
-tf.math.sin(angles)[:, None],
x_offset[:, None],
tf.math.sin(angles)[:, None],
tf.math.cos(angles)[:, None],
y_offset[:, None],
tf.zeros((num_angles, 2), tf.dtypes.float32),
],
axis=1,
)
def apply_transform_to_images(
images,
transforms,
fill_mode='reflect',
fill_value=0.0,
interpolation='bilinear',
output_shape=None,
name=None,
):
"""Applies the given transform(s) to the image(s).
Args:
images: A tensor of shape `(num_images, num_rows, num_columns,
num_channels)` (NHWC). The rank must be statically known (the shape is
not `TensorShape(None)`).
transforms: Projective transform matrix/matrices. A vector of length 8 or
tensor of size N x 8. If one row of transforms is [a0, a1, a2, b0, b1,
b2, c0, c1], then it maps the *output* point `(x, y)` to a transformed
*input* point `(x', y') = ((a0 x + a1 y + a2) / k, (b0 x + b1 y + b2) /
k)`, where `k = c0 x + c1 y + 1`. The transforms are *inverted* compared
to the transform mapping input points to output points. Note that
gradients are not backpropagated into transformation parameters.
fill_mode: Points outside the boundaries of the input are filled according
to the given mode (one of `{"constant", "reflect", "wrap", "nearest"}`).
fill_value: a float represents the value to be filled outside the
boundaries when `fill_mode="constant"`.
interpolation: Interpolation mode. Supported values: `"nearest"`,
`"bilinear"`.
output_shape: Output dimension after the transform, `[height, width]`. If
`None`, output is the same size as input image.
    name: The name of the op.
  Fill mode behavior for each valid value is as follows:
  - `"reflect"`: `(d c b a | a b c d | d c b a)` The input is extended by
    reflecting about the edge of the last pixel.
  - `"constant"`: `(k k k k | a b c d | k k k k)` The input is extended by
    filling all values beyond the edge with the same constant value k = 0.
  - `"wrap"`: `(a b c d | a b c d | a b c d)` The input is extended by
    wrapping around to the opposite edge.
  - `"nearest"`: `(a a a a | a b c d | d d d d)` The input is extended by
    the nearest pixel.
  Input shape: 4D tensor with shape `(samples, height, width, channels)`, in
    `"channels_last"` format.
  Output shape: 4D tensor with shape `(samples, height, width, channels)`, in
    `"channels_last"` format.
Returns:
Image(s) with the same type and shape as `images`, with the given
transform(s) applied. Transformed coordinates outside of the input image
will be filled with zeros.
"""
with tf.name_scope(name or 'transform'):
if output_shape is None:
output_shape = tf.shape(images)[1:3]
if not tf.executing_eagerly():
output_shape_value = tf.get_static_value(output_shape)
if output_shape_value is not None:
output_shape = output_shape_value
output_shape = tf.convert_to_tensor(
output_shape, tf.int32, name='output_shape'
)
if not output_shape.get_shape().is_compatible_with([2]):
raise ValueError(
'output_shape must be a 1-D Tensor of 2 elements: '
'new_height, new_width, instead got '
f'output_shape={output_shape}'
)
fill_value = tf.convert_to_tensor(fill_value, tf.float32, name='fill_value')
return tf.raw_ops.ImageProjectiveTransformV3(
images=images,
output_shape=output_shape,
fill_value=fill_value,
transforms=transforms,
fill_mode=fill_mode.upper(),
interpolation=interpolation.upper(),
)
def transform(image: tf.Tensor, transforms) -> tf.Tensor:
"""Prepares input data for `image_ops.transform`."""
original_ndims = tf.rank(image)
transforms = tf.convert_to_tensor(transforms, dtype=tf.float32)
if transforms.shape.rank == 1:
transforms = transforms[None]
image = to_4d(image)
image = apply_transform_to_images(
images=image, transforms=transforms, interpolation='nearest'
)
return from_4d(image, original_ndims)
def translate(image: tf.Tensor, translations) -> tf.Tensor:
"""Translates image(s) by provided vectors.
Args:
image: An image Tensor of type uint8.
translations: A vector or matrix representing [dx dy].
Returns:
The translated version of the image.
"""
transforms = _convert_translation_to_transform(translations) # pytype: disable=wrong-arg-types # always-use-return-annotations
return transform(image, transforms=transforms)
def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor:
"""Rotates the image by degrees either clockwise or counterclockwise.
Args:
image: An image Tensor of type uint8.
    degrees: Float, a scalar angle in degrees to rotate all images by. If
      degrees is positive, the image will be rotated clockwise; otherwise it
      will be rotated counterclockwise.
Returns:
The rotated version of image.
"""
# Convert from degrees to radians.
degrees_to_radians = math.pi / 180.0
radians = tf.cast(degrees * degrees_to_radians, tf.float32)
original_ndims = tf.rank(image)
image = to_4d(image)
image_height = tf.cast(tf.shape(image)[1], tf.float32)
image_width = tf.cast(tf.shape(image)[2], tf.float32)
transforms = _convert_angles_to_transform(
angles=radians, image_width=image_width, image_height=image_height)
# In practice, we should randomize the rotation degrees by flipping
# it negatively half the time, but that's done on 'degrees' outside
# of the function.
image = transform(image, transforms=transforms)
return from_4d(image, original_ndims)
def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor:
"""Blend image1 and image2 using 'factor'.
Factor can be above 0.0. A value of 0.0 means only image1 is used.
A value of 1.0 means only image2 is used. A value between 0.0 and
1.0 means we linearly interpolate the pixel values between the two
images. A value greater than 1.0 "extrapolates" the difference
between the two pixel values, and we clip the results to values
between 0 and 255.
Args:
image1: An image Tensor of type uint8.
image2: An image Tensor of type uint8.
factor: A floating point value above 0.0.
Returns:
A blended image Tensor of type uint8.
"""
if factor == 0.0:
return tf.convert_to_tensor(image1)
if factor == 1.0:
return tf.convert_to_tensor(image2)
image1 = tf.cast(image1, tf.float32)
image2 = tf.cast(image2, tf.float32)
difference = image2 - image1
scaled = factor * difference
# Do addition in float.
temp = tf.cast(image1, tf.float32) + scaled
# Interpolate
if factor > 0.0 and factor < 1.0:
# Interpolation means we always stay within 0 and 255.
return tf.cast(temp, tf.uint8)
# Extrapolate:
#
# We need to clip and then cast.
return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8)
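# A short numeric sketch of blend() semantics, assuming uint8 inputs:
# factor=0.5 interpolates halfway between the two images, while factors above
# 1.0 extrapolate past image2 and are clipped to [0, 255]. The helper name is
# hypothetical.
def _example_blend_midpoint() -> tf.Tensor:
  """Blends a black and a white pixel halfway; each channel is ~127."""
  black = tf.zeros([1, 1, 3], dtype=tf.uint8)
  white = tf.fill([1, 1, 3], tf.constant(255, dtype=tf.uint8))
  return blend(black, white, 0.5)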
def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor:
"""Apply cutout (https://arxiv.org/abs/1708.04552) to image.
  This operation applies a (2*pad_size x 2*pad_size) mask of zeros to
  a random location within `image`. The pixels covered by the mask are filled
  with the value `replace`. The location where the mask is applied is chosen
  uniformly at random over the whole image.
  Args:
    image: An image Tensor of type uint8.
    pad_size: Specifies the size of the zero mask applied to the image. The
      mask will be of size (2*pad_size x 2*pad_size).
    replace: The pixel value to fill into the area covered by the cutout mask.
Returns:
An image Tensor that is of type uint8.
"""
image_height = tf.shape(image)[0]
image_width = tf.shape(image)[1]
# Sample the center location in the image where the zero mask will be applied.
cutout_center_height = tf.random.uniform(
shape=[], minval=0, maxval=image_height, dtype=tf.int32)
cutout_center_width = tf.random.uniform(
shape=[], minval=0, maxval=image_width, dtype=tf.int32)
lower_pad = tf.maximum(0, cutout_center_height - pad_size)
upper_pad = tf.maximum(0, image_height - cutout_center_height - pad_size)
left_pad = tf.maximum(0, cutout_center_width - pad_size)
right_pad = tf.maximum(0, image_width - cutout_center_width - pad_size)
cutout_shape = [
image_height - (lower_pad + upper_pad),
image_width - (left_pad + right_pad)
]
padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
mask = tf.pad(
tf.zeros(cutout_shape, dtype=image.dtype),
padding_dims,
constant_values=1)
mask = tf.expand_dims(mask, -1)
mask = tf.tile(mask, [1, 1, 3])
image = tf.where(
tf.equal(mask, 0),
tf.ones_like(image, dtype=image.dtype) * replace, image)
return image
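# Illustrative usage of cutout(), assuming a uint8 RGB image: a square of up
# to 2*pad_size x 2*pad_size pixels (clipped at the image borders) is filled
# with `replace`. The helper name is hypothetical.
def _example_cutout(image: tf.Tensor) -> tf.Tensor:
  """Applies a gray cutout patch of up to 100x100 pixels to `image`."""
  return cutout(image, pad_size=50, replace=128)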
def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor:
# For each pixel in the image, select the pixel
# if the value is less than the threshold.
# Otherwise, subtract 255 from the pixel.
return tf.where(image < threshold, image, 255 - image)
def solarize_add(image: tf.Tensor,
addition: int = 0,
threshold: int = 128) -> tf.Tensor:
# For each pixel in the image less than threshold
# we add 'addition' amount to it and then clip the
# pixel value to be between 0 and 255. The value
# of 'addition' is between -128 and 128.
added_image = tf.cast(image, tf.int64) + addition
added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8)
return tf.where(image < threshold, added_image, image)
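# A small numeric sketch of the two solarize variants above, assuming uint8
# pixel values: solarize() inverts pixels at or above the threshold, while
# solarize_add() brightens only the pixels below it. The helper name is
# hypothetical.
def _example_solarize_pair():
  """Returns (solarized, solarize_added) versions of a one-pixel image."""
  image = tf.constant([[[100, 150, 200]]], dtype=tf.uint8)
  return solarize(image, threshold=128), solarize_add(image, addition=50)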
def color(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Color."""
degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image))
return blend(degenerate, image, factor)
def contrast(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Contrast."""
degenerate = tf.image.rgb_to_grayscale(image)
# Cast before calling tf.histogram.
degenerate = tf.cast(degenerate, tf.int32)
  # Compute the grayscale histogram, then compute the mean pixel value,
  # and create a constant image of that mean value. Use that as the
  # degenerate blending target for the original image.
hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256)
mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0
degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean
degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8))
return blend(degenerate, image, factor)
def brightness(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Equivalent of PIL Brightness."""
degenerate = tf.zeros_like(image)
return blend(degenerate, image, factor)
def posterize(image: tf.Tensor, bits: int) -> tf.Tensor:
"""Equivalent of PIL Posterize."""
shift = 8 - bits
return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift)
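# A quick sketch of posterize(): with bits=4 the lower 4 bits of each channel
# are zeroed, e.g. 173 (0b10101101) becomes 160 (0b10100000). The helper name
# is hypothetical.
def _example_posterize() -> tf.Tensor:
  """Posterizes a one-pixel image down to 4 bits per channel."""
  return posterize(tf.constant([[[173, 6, 255]]], dtype=tf.uint8), bits=4)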
def wrapped_rotate(image: tf.Tensor, degrees: float, replace: int) -> tf.Tensor:
"""Applies rotation with wrap/unwrap."""
image = rotate(wrap(image), degrees=degrees)
return unwrap(image, replace)
def translate_x(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
"""Equivalent of PIL Translate in X dimension."""
image = translate(wrap(image), [-pixels, 0])
return unwrap(image, replace)
def translate_y(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor:
"""Equivalent of PIL Translate in Y dimension."""
image = translate(wrap(image), [0, -pixels])
return unwrap(image, replace)
def shear_x(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
"""Equivalent of PIL Shearing in X dimension."""
# Shear parallel to x axis is a projective transform
# with a matrix form of:
# [1 level
# 0 1].
image = transform(
image=wrap(image), transforms=[1., level, 0., 0., 1., 0., 0., 0.])
return unwrap(image, replace)
def shear_y(image: tf.Tensor, level: float, replace: int) -> tf.Tensor:
"""Equivalent of PIL Shearing in Y dimension."""
# Shear parallel to y axis is a projective transform
# with a matrix form of:
# [1 0
# level 1].
image = transform(
image=wrap(image), transforms=[1., 0., 0., level, 1., 0., 0., 0.])
return unwrap(image, replace)
def autocontrast(image: tf.Tensor) -> tf.Tensor:
"""Implements Autocontrast function from PIL using TF ops.
Args:
image: A 3D uint8 tensor.
Returns:
    The image after autocontrast has been applied; the result is of type
    uint8.
"""
def scale_channel(image: tf.Tensor) -> tf.Tensor:
"""Scale the 2D image using the autocontrast rule."""
    # A possibly cheaper version could use cumsum/unique_with_counts over the
    # histogram values to compute the mins and maxes, rather than iterating
    # over the entire image.
lo = tf.cast(tf.reduce_min(image), tf.float32)
hi = tf.cast(tf.reduce_max(image), tf.float32)
# Scale the image, making the lowest value 0 and the highest value 255.
def scale_values(im):
scale = 255.0 / (hi - lo)
offset = -lo * scale
im = tf.cast(im, tf.float32) * scale + offset
im = tf.clip_by_value(im, 0.0, 255.0)
return tf.cast(im, tf.uint8)
result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image)
return result
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image[:, :, 0])
s2 = scale_channel(image[:, :, 1])
s3 = scale_channel(image[:, :, 2])
image = tf.stack([s1, s2, s3], 2)
return image
def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor:
"""Implements Sharpness function from PIL using TF ops."""
orig_image = image
image = tf.cast(image, tf.float32)
# Make image 4D for conv operation.
image = tf.expand_dims(image, 0)
# SMOOTH PIL Kernel.
kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]],
dtype=tf.float32,
shape=[3, 3, 1, 1]) / 13.
# Tile across channel dimension.
kernel = tf.tile(kernel, [1, 1, 3, 1])
strides = [1, 1, 1, 1]
degenerate = tf.nn.depthwise_conv2d(
image, kernel, strides, padding='VALID', dilations=[1, 1])
degenerate = tf.clip_by_value(degenerate, 0.0, 255.0)
degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0])
# For the borders of the resulting image, fill in the values of the
# original image.
mask = tf.ones_like(degenerate)
padded_mask = tf.pad(mask, [[1, 1], [1, 1], [0, 0]])
padded_degenerate = tf.pad(degenerate, [[1, 1], [1, 1], [0, 0]])
result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image)
# Blend the final result.
return blend(result, orig_image, factor)
def equalize(image: tf.Tensor) -> tf.Tensor:
"""Implements Equalize function from PIL using TF ops."""
def scale_channel(im, c):
"""Scale the data in the channel to implement equalize."""
im = tf.cast(im[:, :, c], tf.int32)
# Compute the histogram of the image channel.
histo = tf.histogram_fixed_width(im, [0, 255], nbins=256)
    # For the purposes of computing the step, filter out the zero-count bins.
nonzero = tf.where(tf.not_equal(histo, 0))
nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1])
step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255
def build_lut(histo, step):
# Compute the cumulative sum, shifting by step // 2
# and then normalization by step.
lut = (tf.cumsum(histo) + (step // 2)) // step
# Shift lut, prepending with 0.
lut = tf.concat([[0], lut[:-1]], 0)
# Clip the counts to be in range. This is done
# in the C code for image.point.
return tf.clip_by_value(lut, 0, 255)
# If step is zero, return the original image. Otherwise, build
# lut from the full histogram and step and then index from it.
result = tf.cond(
tf.equal(step, 0), lambda: im,
lambda: tf.gather(build_lut(histo, step), im))
return tf.cast(result, tf.uint8)
# Assumes RGB for now. Scales each channel independently
# and then stacks the result.
s1 = scale_channel(image, 0)
s2 = scale_channel(image, 1)
s3 = scale_channel(image, 2)
image = tf.stack([s1, s2, s3], 2)
return image
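# Illustrative usage of the two contrast-stretching ops above, assuming a
# uint8 RGB image: equalize() remaps intensities through a histogram-based
# lookup table, while autocontrast() linearly rescales each channel so its
# minimum maps to 0 and its maximum to 255. The helper name is hypothetical.
def _example_contrast_ops(image: tf.Tensor):
  """Returns (equalized, autocontrasted) versions of `image`."""
  return equalize(image), autocontrast(image)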
def invert(image: tf.Tensor) -> tf.Tensor:
"""Inverts the image pixels."""
image = tf.convert_to_tensor(image)
return 255 - image
def wrap(image: tf.Tensor) -> tf.Tensor:
"""Returns 'image' with an extra channel set to all 1s."""
shape = tf.shape(image)
extended_channel = tf.ones([shape[0], shape[1], 1], image.dtype)
extended = tf.concat([image, extended_channel], axis=2)
return extended
def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor:
"""Unwraps an image produced by wrap.
  Where the last channel is 0 for a spatial position, the other three
  channels at that position are filled with the `replace` value. Operations
  like translate and shear on a wrapped Tensor will leave 0s in empty
  locations. Some transformations look at the intensity of values to do
  preprocessing, and we want these empty pixels to assume the 'average'
  value (e.g. 128), rather than pure black.
Args:
image: A 3D Image Tensor with 4 channels.
replace: A one or three value 1D tensor to fill empty pixels.
Returns:
image: A 3D image Tensor with 3 channels.
"""
image_shape = tf.shape(image)
# Flatten the spatial dimensions.
flattened_image = tf.reshape(image, [-1, image_shape[2]])
# Find all pixels where the last channel is zero.
alpha_channel = tf.expand_dims(flattened_image[:, 3], axis=-1)
replace = tf.concat([replace, tf.ones([1], image.dtype)], 0)
# Where they are zero, fill them in with 'replace'.
flattened_image = tf.where(
tf.equal(alpha_channel, 0),
tf.ones_like(flattened_image, dtype=image.dtype) * replace,
flattened_image)
image = tf.reshape(flattened_image, image_shape)
image = tf.slice(image, [0, 0, 0], [image_shape[0], image_shape[1], 3])
return image
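# An illustrative round trip through wrap()/unwrap(), assuming a 3-channel
# uint8 image: the extra all-ones channel acts as a validity mask so pixels
# shifted in from outside the image can be recognized and filled with
# `replace`. The helper name is hypothetical.
def _example_wrapped_translate(image: tf.Tensor) -> tf.Tensor:
  """Translates `image` by 10 pixels, filling exposed pixels with gray."""
  wrapped = wrap(image)
  translated = translate(wrapped, [10, 10])
  return unwrap(translated, replace=[128, 128, 128])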
def _randomly_negate_tensor(tensor):
"""With 50% prob turn the tensor negative."""
should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool)
final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor)
return final_tensor
def _rotate_level_to_arg(level: float):
level = (level / _MAX_LEVEL) * 30.
level = _randomly_negate_tensor(level)
return (level,)
def _shrink_level_to_arg(level: float):
"""Converts level to ratio by which we shrink the image content."""
if level == 0:
return (1.0,) # if level is zero, do not shrink the image
# Maximum shrinking ratio is 2.9.
level = 2. / (_MAX_LEVEL / level) + 0.9
return (level,)
def _enhance_level_to_arg(level: float):
return ((level / _MAX_LEVEL) * 1.8 + 0.1,)
def _shear_level_to_arg(level: float):
level = (level / _MAX_LEVEL) * 0.3
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level,)
def _translate_level_to_arg(level: float, translate_const: float):
level = (level / _MAX_LEVEL) * float(translate_const)
# Flip level to negative with 50% chance.
level = _randomly_negate_tensor(level)
return (level,)
def _mult_to_arg(level: float, multiplier: float = 1.):
return (int((level / _MAX_LEVEL) * multiplier),)
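# A small numeric sketch of the level-to-argument mappings above, assuming the
# usual _MAX_LEVEL of 10: level 10 maps to a shear magnitude of 0.3 (before
# the random sign flip), an enhancement factor of 1.9, and 4 posterize bits.
# The helper name is hypothetical.
def _example_level_mapping(level: float = 10.):
  """Returns (shear_arg, enhance_arg, posterize_arg) for `level`."""
  return (_shear_level_to_arg(level), _enhance_level_to_arg(level),
          _mult_to_arg(level, 4))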
def _apply_func_with_prob(func: Any, image: tf.Tensor, args: Any, prob: float):
"""Apply `func` to image w/ `args` as input with probability `prob`."""
assert isinstance(args, tuple)
# Apply the function with probability `prob`.
should_apply_op = tf.cast(
tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool)
augmented_image = tf.cond(should_apply_op, lambda: func(image, *args),
lambda: image)
return augmented_image
def select_and_apply_random_policy(policies: Any, image: tf.Tensor):
"""Select a random policy from `policies` and apply it to `image`."""
policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32)
# Note that using tf.case instead of tf.conds would result in significantly
# larger graphs and would even break export for some larger policies.
for (i, policy) in enumerate(policies):
image = tf.cond(
tf.equal(i, policy_to_select),
lambda selected_policy=policy: selected_policy(image),
lambda: image)
return image
NAME_TO_FUNC = {
'AutoContrast': autocontrast,
'Equalize': equalize,
'Invert': invert,
'Rotate': wrapped_rotate,
'Posterize': posterize,
'Solarize': solarize,
'SolarizeAdd': solarize_add,
'Color': color,
'Contrast': contrast,
'Brightness': brightness,
'Sharpness': sharpness,
'ShearX': shear_x,
'ShearY': shear_y,
'TranslateX': translate_x,
'TranslateY': translate_y,
'Cutout': cutout,
}
# Functions that have a 'replace' parameter
REPLACE_FUNCS = frozenset({
'Rotate',
'TranslateX',
'ShearX',
'ShearY',
'TranslateY',
'Cutout',
})
def level_to_arg(cutout_const: float, translate_const: float):
"""Creates a dict mapping image operation names to their arguments."""
no_arg = lambda level: ()
posterize_arg = lambda level: _mult_to_arg(level, 4)
solarize_arg = lambda level: _mult_to_arg(level, 256)
solarize_add_arg = lambda level: _mult_to_arg(level, 110)
cutout_arg = lambda level: _mult_to_arg(level, cutout_const)
translate_arg = lambda level: _translate_level_to_arg(level, translate_const)
args = {
'AutoContrast': no_arg,
'Equalize': no_arg,
'Invert': no_arg,
'Rotate': _rotate_level_to_arg,
'Posterize': posterize_arg,
'Solarize': solarize_arg,
'SolarizeAdd': solarize_add_arg,
'Color': _enhance_level_to_arg,
'Contrast': _enhance_level_to_arg,
'Brightness': _enhance_level_to_arg,
'Sharpness': _enhance_level_to_arg,
'ShearX': _shear_level_to_arg,
'ShearY': _shear_level_to_arg,
'Cutout': cutout_arg,
'TranslateX': translate_arg,
'TranslateY': translate_arg,
}
return args
def _parse_policy_info(name: Text, prob: float, level: float,
replace_value: List[int], cutout_const: float,
translate_const: float) -> Tuple[Any, float, Any]:
"""Return the function that corresponds to `name` and update `level` param."""
func = NAME_TO_FUNC[name]
args = level_to_arg(cutout_const, translate_const)[name](level)
if name in REPLACE_FUNCS:
# Add in replace arg if it is required for the function that is called.
args = tuple(list(args) + [replace_value])
return func, prob, args
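# Illustrative use of _parse_policy_info(): resolving a single
# ('Rotate', prob, level) entry into a callable plus its arguments. The
# constants mirror the AutoAugment defaults below, and the helper name is
# hypothetical.
def _example_parse_rotate():
  """Parses a Rotate policy entry into (func, prob, args)."""
  return _parse_policy_info('Rotate', prob=0.8, level=8.,
                            replace_value=[128, 128, 128],
                            cutout_const=100., translate_const=250.)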
class ImageAugment(object):
"""Image augmentation class for applying image distortions."""
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Given an image tensor, returns a distorted image with the same shape.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
The augmented version of `image`.
"""
raise NotImplementedError()
class AutoAugment(ImageAugment):
"""Applies the AutoAugment policy to images.
AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
"""
def __init__(self,
augmentation_name: Text = 'v0',
policies: Optional[Dict[Text, Any]] = None,
cutout_const: float = 100,
translate_const: float = 250):
"""Applies the AutoAugment policy to images.
Args:
      augmentation_name: The name of the AutoAugment policy to use. The
        available options are `v0`, `simple` and `test`. `v0` is the policy
        used for all of the results in the paper and was found to achieve the
        best results on the COCO dataset. `simple` is the same policy with the
        custom ops removed, and `test` is a single sub-policy intended for
        debugging.
      policies: optional dict mapping a policy name to a list of sub-policies,
        where each sub-policy is a list of tuples in the form
        `(func, prob, level)`: `func` is a string name of the augmentation
        function, `prob` is the probability of applying `func`, and `level` is
        the input argument for `func`.
cutout_const: multiplier for applying cutout.
translate_const: multiplier for applying translation.
"""
super(AutoAugment, self).__init__()
    if policies is None:
      self.available_policies = {
          'v0': self.policy_v0(),
          'test': self.policy_test(),
          'simple': self.policy_simple(),
      }
    else:
      self.available_policies = policies
if augmentation_name not in self.available_policies:
raise ValueError(
'Invalid augmentation_name: {}'.format(augmentation_name))
self.augmentation_name = augmentation_name
self.policies = self.available_policies[augmentation_name]
self.cutout_const = float(cutout_const)
self.translate_const = float(translate_const)
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Applies the AutoAugment policy to `image`.
AutoAugment is from the paper: https://arxiv.org/abs/1805.09501.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
      A version of image that now has data augmentation applied to it based on
      the `policies` passed into the function.
"""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
replace_value = [128] * 3
# func is the string name of the augmentation function, prob is the
# probability of applying the operation and level is the parameter
# associated with the tf op.
# tf_policies are functions that take in an image and return an augmented
# image.
tf_policies = []
for policy in self.policies:
tf_policy = []
# Link string name to the correct python function and make sure the
# correct argument is passed into that function.
for policy_info in policy:
policy_info = list(policy_info) + [
replace_value, self.cutout_const, self.translate_const
]
tf_policy.append(_parse_policy_info(*policy_info))
      # Now build the tf policy that will apply the augmentation procedure
      # on image.
def make_final_policy(tf_policy_):
def final_policy(image_):
for func, prob, args in tf_policy_:
image_ = _apply_func_with_prob(func, image_, args, prob)
return image_
return final_policy
tf_policies.append(make_final_policy(tf_policy))
image = select_and_apply_random_policy(tf_policies, image)
image = tf.cast(image, dtype=input_image_type)
return image
@staticmethod
def policy_v0():
"""Autoaugment policy that was used in AutoAugment Paper.
Each tuple is an augmentation operation of the form
(operation, probability, magnitude). Each element in policy is a
sub-policy that will be applied sequentially on the image.
Returns:
the policy.
"""
policy = [
[('Equalize', 0.8, 1), ('ShearY', 0.8, 4)],
[('Color', 0.4, 9), ('Equalize', 0.6, 3)],
[('Color', 0.4, 1), ('Rotate', 0.6, 8)],
[('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
[('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
[('Color', 0.2, 0), ('Equalize', 0.8, 8)],
[('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
[('ShearX', 0.2, 9), ('Rotate', 0.6, 8)],
[('Color', 0.6, 1), ('Equalize', 1.0, 2)],
[('Invert', 0.4, 9), ('Rotate', 0.6, 0)],
[('Equalize', 1.0, 9), ('ShearY', 0.6, 3)],
[('Color', 0.4, 7), ('Equalize', 0.6, 0)],
[('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
[('Solarize', 0.6, 8), ('Color', 0.6, 9)],
[('Solarize', 0.2, 4), ('Rotate', 0.8, 9)],
[('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)],
[('ShearX', 0.0, 0), ('Solarize', 0.8, 4)],
[('ShearY', 0.8, 0), ('Color', 0.6, 4)],
[('Color', 1.0, 0), ('Rotate', 0.6, 2)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
[('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
[('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)],
[('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
[('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
[('Color', 0.8, 6), ('Rotate', 0.4, 5)],
]
return policy
@staticmethod
def policy_simple():
"""Same as `policy_v0`, except with custom ops removed."""
policy = [
[('Color', 0.4, 9), ('Equalize', 0.6, 3)],
[('Solarize', 0.8, 3), ('Equalize', 0.4, 7)],
[('Solarize', 0.4, 2), ('Solarize', 0.6, 2)],
[('Color', 0.2, 0), ('Equalize', 0.8, 8)],
[('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)],
[('Color', 0.6, 1), ('Equalize', 1.0, 2)],
[('Color', 0.4, 7), ('Equalize', 0.6, 0)],
[('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)],
[('Solarize', 0.6, 8), ('Color', 0.6, 9)],
[('Equalize', 0.8, 4), ('Equalize', 0.0, 8)],
[('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)],
[('Posterize', 0.8, 2), ('Solarize', 0.6, 10)],
[('Solarize', 0.6, 8), ('Equalize', 0.6, 1)],
]
return policy
@staticmethod
def policy_test():
"""Autoaugment test policy for debugging."""
policy = [
[('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)],
]
return policy
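# A minimal illustrative sketch of passing a custom `policies` dict, assuming
# the (func, prob, level) tuple convention documented in __init__; the policy
# contents and name here are arbitrary placeholders, and the helper name is
# hypothetical.
def _example_custom_autoaugment() -> AutoAugment:
  """Builds an AutoAugment instance from a single-sub-policy custom dict."""
  custom = {'my_policy': [[('Equalize', 0.8, 1), ('ShearY', 0.8, 4)]]}
  return AutoAugment(augmentation_name='my_policy', policies=custom)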
class RandAugment(ImageAugment):
"""Applies the RandAugment policy to images.
  RandAugment is from the paper https://arxiv.org/abs/1909.13719.
"""
def __init__(self,
num_layers: int = 2,
magnitude: float = 10.,
cutout_const: float = 40.,
translate_const: float = 100.):
"""Applies the RandAugment policy to images.
Args:
num_layers: Integer, the number of augmentation transformations to apply
sequentially to an image. Represented as (N) in the paper. Usually best
values will be in the range [1, 3].
      magnitude: Float, shared magnitude across all augmentation operations.
Represented as (M) in the paper. Usually best values are in the range
[5, 10].
cutout_const: multiplier for applying cutout.
translate_const: multiplier for applying translation.
"""
super(RandAugment, self).__init__()
self.num_layers = num_layers
self.magnitude = float(magnitude)
self.cutout_const = float(cutout_const)
self.translate_const = float(translate_const)
self.available_ops = [
'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize',
'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX', 'ShearY',
'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd'
]
def distort(self, image: tf.Tensor) -> tf.Tensor:
"""Applies the RandAugment policy to `image`.
Args:
image: `Tensor` of shape [height, width, 3] representing an image.
Returns:
The augmented version of `image`.
"""
input_image_type = image.dtype
if input_image_type != tf.uint8:
image = tf.clip_by_value(image, 0.0, 255.0)
image = tf.cast(image, dtype=tf.uint8)
replace_value = [128] * 3
min_prob, max_prob = 0.2, 0.8
for _ in range(self.num_layers):
op_to_select = tf.random.uniform([],
maxval=len(self.available_ops) + 1,
dtype=tf.int32)
branch_fns = []
for (i, op_name) in enumerate(self.available_ops):
prob = tf.random.uniform([],
minval=min_prob,
maxval=max_prob,
dtype=tf.float32)
func, _, args = _parse_policy_info(op_name, prob, self.magnitude,
replace_value, self.cutout_const,
self.translate_const)
branch_fns.append((
i,
# pylint:disable=g-long-lambda
lambda selected_func=func, selected_args=args: selected_func(
image, *selected_args)))
# pylint:enable=g-long-lambda
image = tf.switch_case(
branch_index=op_to_select,
branch_fns=branch_fns,
default=lambda: tf.identity(image))
image = tf.cast(image, dtype=input_image_type)
return image
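# A minimal illustrative usage of RandAugment, assuming a uint8 HxWx3 image;
# num_layers and magnitude follow the defaults suggested in the docstring.
# The helper name is hypothetical.
def _example_randaugment(image: tf.Tensor) -> tf.Tensor:
  """Distorts `image` with two RandAugment layers at magnitude 10."""
  augmenter = RandAugment(num_layers=2, magnitude=10.)
  return augmenter.distort(image)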
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for autoaugment."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from official.legacy.image_classification import augment
def get_dtype_test_cases():
return [
('uint8', tf.uint8),
('int32', tf.int32),
('float16', tf.float16),
('float32', tf.float32),
]
@parameterized.named_parameters(get_dtype_test_cases())
class TransformsTest(parameterized.TestCase, tf.test.TestCase):
"""Basic tests for fundamental transformations."""
def test_to_from_4d(self, dtype):
for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]:
original_ndims = len(shape)
image = tf.zeros(shape, dtype=dtype)
image_4d = augment.to_4d(image)
self.assertEqual(4, tf.rank(image_4d))
self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims))
def test_transform(self, dtype):
image = tf.constant([[1, 2], [3, 4]], dtype=dtype)
self.assertAllEqual(
augment.transform(image, transforms=[1] * 8), [[4, 4], [4, 4]])
def test_translate(self, dtype):
image = tf.constant(
[[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], dtype=dtype)
translations = [-1, -1]
translated = augment.translate(image=image, translations=translations)
expected = [[1, 0, 1, 1], [0, 1, 0, 0], [1, 0, 1, 1], [1, 0, 1, 1]]
self.assertAllEqual(translated, expected)
def test_translate_shapes(self, dtype):
translation = [0, 0]
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.translate(image, translation))
def test_translate_invalid_translation(self, dtype):
image = tf.zeros((1, 1), dtype=dtype)
invalid_translation = [[[1, 1]]]
with self.assertRaisesRegex(TypeError, 'rank 1 or 2'):
_ = augment.translate(image, invalid_translation)
def test_rotate(self, dtype):
image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3))
rotation = 90.
transformed = augment.rotate(image=image, degrees=rotation)
expected = [[2, 5, 8], [1, 4, 7], [0, 3, 6]]
self.assertAllEqual(transformed, expected)
def test_rotate_shapes(self, dtype):
degrees = 0.
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.rotate(image, degrees))
class AutoaugmentTest(tf.test.TestCase):
def test_autoaugment(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.AutoAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_randaug(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.RandAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_all_policy_ops(self):
"""Smoke test to be sure all augmentation functions can execute."""
prob = 1
magnitude = 10
replace_value = [128] * 3
cutout_const = 100
translate_const = 250
image = tf.ones((224, 224, 3), dtype=tf.uint8)
for op_name in augment.NAME_TO_FUNC:
func, _, args = augment._parse_policy_info(op_name, prob, magnitude,
replace_value, cutout_const,
translate_const)
image = func(image, *args)
self.assertEqual((224, 224, 3), image.shape)
if __name__ == '__main__':
tf.test.main()
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Common modules for callbacks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from typing import Any, List, MutableMapping, Optional, Text
from absl import logging
import tensorflow as tf
from official.modeling import optimization
from official.utils.misc import keras_utils
def get_callbacks(
model_checkpoint: bool = True,
include_tensorboard: bool = True,
time_history: bool = True,
track_lr: bool = True,
write_model_weights: bool = True,
apply_moving_average: bool = False,
initial_step: int = 0,
batch_size: int = 0,
log_steps: int = 0,
model_dir: Optional[str] = None,
backup_and_restore: bool = False) -> List[tf.keras.callbacks.Callback]:
"""Get all callbacks."""
model_dir = model_dir or ''
callbacks = []
if model_checkpoint:
ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(
tf.keras.callbacks.ModelCheckpoint(
ckpt_full_path, save_weights_only=True, verbose=1))
if backup_and_restore:
backup_dir = os.path.join(model_dir, 'tmp')
callbacks.append(
tf.keras.callbacks.experimental.BackupAndRestore(backup_dir))
if include_tensorboard:
callbacks.append(
CustomTensorBoard(
log_dir=model_dir,
track_lr=track_lr,
initial_step=initial_step,
write_images=write_model_weights,
profile_batch=0))
if time_history:
callbacks.append(
keras_utils.TimeHistory(
batch_size,
log_steps,
logdir=model_dir if include_tensorboard else None))
if apply_moving_average:
# Save moving average model to a different file so that
# we can resume training from a checkpoint
ckpt_full_path = os.path.join(model_dir, 'average',
'model.ckpt-{epoch:04d}')
callbacks.append(
AverageModelCheckpoint(
update_weights=False,
filepath=ckpt_full_path,
save_weights_only=True,
verbose=1))
callbacks.append(MovingAverageCallback())
return callbacks
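# An illustrative call to get_callbacks(), assuming a Keras training run that
# checkpoints and logs to TensorBoard under `model_dir`; the values shown are
# placeholders rather than recommended settings, and the helper name is
# hypothetical.
def _example_callbacks(model_dir: str) -> List[tf.keras.callbacks.Callback]:
  """Builds the standard callback list for a toy training run."""
  return get_callbacks(
      model_checkpoint=True,
      include_tensorboard=True,
      time_history=True,
      track_lr=False,
      write_model_weights=False,
      batch_size=32,
      log_steps=100,
      model_dir=model_dir)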
def get_scalar_from_tensor(t: tf.Tensor) -> int:
"""Utility function to convert a Tensor to a scalar."""
t = tf.keras.backend.get_value(t)
if callable(t):
return t()
else:
return t
class CustomTensorBoard(tf.keras.callbacks.TensorBoard):
"""A customized TensorBoard callback that tracks additional datapoints.
Metrics tracked:
- Global learning rate
Attributes:
log_dir: the path of the directory where to save the log files to be parsed
by TensorBoard.
track_lr: `bool`, whether or not to track the global learning rate.
initial_step: the initial step, used for preemption recovery.
**kwargs: Additional arguments for backwards compatibility. Possible key is
`period`.
"""
# TODO(b/146499062): track params, flops, log lr, l2 loss,
# classification loss
def __init__(self,
log_dir: str,
track_lr: bool = False,
initial_step: int = 0,
**kwargs):
super(CustomTensorBoard, self).__init__(log_dir=log_dir, **kwargs)
self.step = initial_step
self._track_lr = track_lr
def on_batch_begin(self,
epoch: int,
logs: Optional[MutableMapping[str, Any]] = None) -> None:
self.step += 1
if logs is None:
logs = {}
logs.update(self._calculate_metrics())
super(CustomTensorBoard, self).on_batch_begin(epoch, logs)
def on_epoch_begin(self,
epoch: int,
logs: Optional[MutableMapping[str, Any]] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
for k, v in metrics.items():
logging.info('Current %s: %f', k, v)
super(CustomTensorBoard, self).on_epoch_begin(epoch, logs)
def on_epoch_end(self,
epoch: int,
logs: Optional[MutableMapping[str, Any]] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
super(CustomTensorBoard, self).on_epoch_end(epoch, logs)
def _calculate_metrics(self) -> MutableMapping[str, Any]:
logs = {}
# TODO(b/149030439): disable LR reporting.
# if self._track_lr:
# logs['learning_rate'] = self._calculate_lr()
return logs
def _calculate_lr(self) -> int:
"""Calculates the learning rate given the current step."""
return get_scalar_from_tensor(
self._get_base_optimizer()._decayed_lr(var_dtype=tf.float32)) # pylint:disable=protected-access
def _get_base_optimizer(self) -> tf.keras.optimizers.Optimizer:
"""Get the base optimizer used by the current model."""
optimizer = self.model.optimizer
# The optimizer might be wrapped by another class, so unwrap it
while hasattr(optimizer, '_optimizer'):
optimizer = optimizer._optimizer # pylint:disable=protected-access
return optimizer
class MovingAverageCallback(tf.keras.callbacks.Callback):
"""A Callback to be used with a `ExponentialMovingAverage` optimizer.
Applies moving average weights to the model during validation time to test
and predict on the averaged weights rather than the current model weights.
  Once training is complete, the model weights can be overwritten with the
  averaged weights by setting `overwrite_weights_on_train_end`.
Attributes:
overwrite_weights_on_train_end: Whether to overwrite the current model
weights with the averaged weights from the moving average optimizer.
**kwargs: Any additional callback arguments.
"""
def __init__(self, overwrite_weights_on_train_end: bool = False, **kwargs):
super(MovingAverageCallback, self).__init__(**kwargs)
self.overwrite_weights_on_train_end = overwrite_weights_on_train_end
def set_model(self, model: tf.keras.Model):
super(MovingAverageCallback, self).set_model(model)
assert isinstance(self.model.optimizer,
optimization.ExponentialMovingAverage)
self.model.optimizer.shadow_copy(self.model)
def on_test_begin(self, logs: Optional[MutableMapping[Text, Any]] = None):
self.model.optimizer.swap_weights()
def on_test_end(self, logs: Optional[MutableMapping[Text, Any]] = None):
self.model.optimizer.swap_weights()
def on_train_end(self, logs: Optional[MutableMapping[Text, Any]] = None):
if self.overwrite_weights_on_train_end:
self.model.optimizer.assign_average_vars(self.model.variables)
class AverageModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
"""Saves and, optionally, assigns the averaged weights.
Taken from tfa.callbacks.AverageModelCheckpoint.
Attributes:
update_weights: If True, assign the moving average weights to the model, and
save them. If False, keep the old non-averaged weights, but the saved
model uses the average weights. See `tf.keras.callbacks.ModelCheckpoint`
for the other args.
"""
def __init__(self,
update_weights: bool,
filepath: str,
monitor: str = 'val_loss',
verbose: int = 0,
save_best_only: bool = False,
save_weights_only: bool = False,
mode: str = 'auto',
save_freq: str = 'epoch',
**kwargs):
self.update_weights = update_weights
super().__init__(filepath, monitor, verbose, save_best_only,
save_weights_only, mode, save_freq, **kwargs)
def set_model(self, model):
if not isinstance(model.optimizer, optimization.ExponentialMovingAverage):
      raise TypeError('AverageModelCheckpoint is only used when training '
                      'with MovingAverage')
return super().set_model(model)
def _save_model(self, epoch, logs):
assert isinstance(self.model.optimizer,
optimization.ExponentialMovingAverage)
if self.update_weights:
self.model.optimizer.assign_average_vars(self.model.variables)
return super()._save_model(epoch, logs) # pytype: disable=attribute-error # typed-keras
else:
# Note: `model.get_weights()` gives us the weights (non-ref)
# whereas `model.variables` returns references to the variables.
non_avg_weights = self.model.get_weights()
self.model.optimizer.assign_average_vars(self.model.variables)
# result is currently None, since `super._save_model` doesn't
# return anything, but this may change in the future.
result = super()._save_model(epoch, logs) # pytype: disable=attribute-error # typed-keras
self.model.set_weights(non_avg_weights)
return result
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Runs an Image Classification model."""
import os
import pprint
from typing import Any, Mapping, Optional, Text, Tuple
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
from official.common import distribute_utils
from official.legacy.image_classification import callbacks as custom_callbacks
from official.legacy.image_classification import dataset_factory
from official.legacy.image_classification import optimizer_factory
from official.legacy.image_classification.configs import base_configs
from official.legacy.image_classification.configs import configs
from official.legacy.image_classification.efficientnet import efficientnet_model
from official.legacy.image_classification.resnet import common
from official.legacy.image_classification.resnet import resnet_model
from official.legacy.image_classification.vgg import vgg_model
from official.modeling import hyperparams
from official.modeling import performance
from official.utils import hyperparams_flags
from official.utils.misc import keras_utils
def get_models() -> Mapping[str, tf.keras.Model]:
"""Returns the mapping from model type name to Keras model."""
return {
'efficientnet': efficientnet_model.EfficientNet.from_name,
'resnet': resnet_model.resnet50,
'vgg': vgg_model.vgg16,
}
def get_dtype_map() -> Mapping[str, tf.dtypes.DType]:
"""Returns the mapping from dtype string representations to TF dtypes."""
return {
'float32': tf.float32,
'bfloat16': tf.bfloat16,
'float16': tf.float16,
'fp32': tf.float32,
'bf16': tf.bfloat16,
}
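# A tiny illustrative lookup against the two mappings above; 'resnet' and
# 'fp32' are keys defined in this module, and the helper name is hypothetical.
def _example_resolve_model_and_dtype():
  """Returns the ResNet-50 constructor and the TF dtype for 'fp32'."""
  return get_models()['resnet'], get_dtype_map()['fp32']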
def _get_metrics(one_hot: bool) -> Mapping[Text, Any]:
"""Get a dict of available metrics to track."""
if one_hot:
return {
# (name, metric_fn)
'acc':
tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'accuracy':
tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_1':
tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_5':
tf.keras.metrics.TopKCategoricalAccuracy(
k=5, name='top_5_accuracy'),
}
else:
return {
# (name, metric_fn)
'acc':
tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'accuracy':
tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_1':
tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_5':
tf.keras.metrics.SparseTopKCategoricalAccuracy(
k=5, name='top_5_accuracy'),
}
def get_image_size_from_model(
params: base_configs.ExperimentConfig) -> Optional[int]:
"""If the given model has a preferred image size, return it."""
if params.model_name == 'efficientnet':
efficientnet_name = params.model.model_params.model_name
if efficientnet_name in efficientnet_model.MODEL_CONFIGS:
return efficientnet_model.MODEL_CONFIGS[efficientnet_name].resolution
return None
def _get_dataset_builders(params: base_configs.ExperimentConfig,
strategy: tf.distribute.Strategy,
one_hot: bool) -> Tuple[Any, Any]:
"""Create and return train and validation dataset builders."""
if one_hot:
logging.warning('label_smoothing > 0, so datasets will be one hot encoded.')
else:
logging.warning('label_smoothing not applied, so datasets will not be one '
'hot encoded.')
num_devices = strategy.num_replicas_in_sync if strategy else 1
image_size = get_image_size_from_model(params)
dataset_configs = [params.train_dataset, params.validation_dataset]
builders = []
for config in dataset_configs:
if config is not None and config.has_data:
builder = dataset_factory.DatasetBuilder(
config,
image_size=image_size or config.image_size,
num_devices=num_devices,
one_hot=one_hot)
else:
builder = None
builders.append(builder)
return builders
def get_loss_scale(params: base_configs.ExperimentConfig,
fp16_default: float = 128.) -> float:
"""Returns the loss scale for initializations."""
loss_scale = params.runtime.loss_scale
if loss_scale == 'dynamic':
return loss_scale
elif loss_scale is not None:
return float(loss_scale)
elif (params.train_dataset.dtype == 'float32' or
params.train_dataset.dtype == 'bfloat16'):
return 1.
else:
assert params.train_dataset.dtype == 'float16'
return fp16_default
def _get_params_from_flags(flags_obj: flags.FlagValues):
"""Get ParamsDict from flags."""
model = flags_obj.model_type.lower()
dataset = flags_obj.dataset.lower()
params = configs.get_config(model=model, dataset=dataset)
flags_overrides = {
'model_dir': flags_obj.model_dir,
'mode': flags_obj.mode,
'model': {
'name': model,
},
'runtime': {
'run_eagerly': flags_obj.run_eagerly,
'tpu': flags_obj.tpu,
},
'train_dataset': {
'data_dir': flags_obj.data_dir,
},
'validation_dataset': {
'data_dir': flags_obj.data_dir,
},
'train': {
'time_history': {
'log_steps': flags_obj.log_steps,
},
},
}
overriding_configs = (flags_obj.config_file, flags_obj.params_override,
flags_overrides)
pp = pprint.PrettyPrinter()
logging.info('Base params: %s', pp.pformat(params.as_dict()))
for param in overriding_configs:
logging.info('Overriding params: %s', param)
params = hyperparams.override_params_dict(params, param, is_strict=True)
params.validate()
params.lock()
logging.info('Final model parameters: %s', pp.pformat(params.as_dict()))
return params
def resume_from_checkpoint(model: tf.keras.Model, model_dir: str,
train_steps: int) -> int:
"""Resumes from the latest checkpoint, if possible.
Loads the model weights and optimizer settings from a checkpoint.
This function should be used in case of preemption recovery.
Args:
model: The model whose weights should be restored.
model_dir: The directory where model weights were saved.
train_steps: The number of steps to train.
Returns:
The epoch of the latest checkpoint, or 0 if not restoring.
"""
logging.info('Load from checkpoint is enabled.')
latest_checkpoint = tf.train.latest_checkpoint(model_dir)
logging.info('latest_checkpoint: %s', latest_checkpoint)
if not latest_checkpoint:
logging.info('No checkpoint detected.')
return 0
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint)
model.load_weights(latest_checkpoint)
initial_epoch = model.optimizer.iterations // train_steps
logging.info('Completed loading from checkpoint.')
logging.info('Resuming from epoch %d', initial_epoch)
return int(initial_epoch)
def initialize(params: base_configs.ExperimentConfig,
dataset_builder: dataset_factory.DatasetBuilder):
"""Initializes backend related initializations."""
keras_utils.set_session_config(enable_xla=params.runtime.enable_xla)
performance.set_mixed_precision_policy(dataset_builder.dtype)
if tf.config.list_physical_devices('GPU'):
data_format = 'channels_first'
else:
data_format = 'channels_last'
tf.keras.backend.set_image_data_format(data_format)
if params.runtime.run_eagerly:
# Enable eager execution to allow step-by-step debugging
tf.config.experimental_run_functions_eagerly(True)
if tf.config.list_physical_devices('GPU'):
if params.runtime.gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=params.runtime.per_gpu_thread_count,
gpu_thread_mode=params.runtime.gpu_thread_mode,
num_gpus=params.runtime.num_gpus,
datasets_num_private_threads=params.runtime
.dataset_num_private_threads) # pylint:disable=line-too-long
if params.runtime.batchnorm_spatial_persistent:
os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
def define_classifier_flags():
"""Defines common flags for image classification."""
hyperparams_flags.initialize_common_flags()
flags.DEFINE_string(
'data_dir', default=None, help='The location of the input data.')
flags.DEFINE_string(
'mode',
default=None,
help='Mode to run: `train`, `eval`, `train_and_eval` or `export`.')
flags.DEFINE_bool(
'run_eagerly',
default=None,
help='Use eager execution and disable autograph for debugging.')
flags.DEFINE_string(
'model_type',
default=None,
help='The type of the model, e.g. EfficientNet, etc.')
flags.DEFINE_string(
'dataset',
default=None,
help='The name of the dataset, e.g. ImageNet, etc.')
flags.DEFINE_integer(
'log_steps',
default=100,
help='The interval of steps between logging of batch level stats.')
def serialize_config(params: base_configs.ExperimentConfig, model_dir: str):
"""Serializes and saves the experiment config."""
params_save_path = os.path.join(model_dir, 'params.yaml')
logging.info('Saving experiment configuration to %s', params_save_path)
tf.io.gfile.makedirs(model_dir)
hyperparams.save_params_dict_to_yaml(params, params_save_path)
def train_and_eval(
params: base_configs.ExperimentConfig,
strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]:
"""Runs the train and eval path using compile/fit."""
logging.info('Running train and eval.')
distribute_utils.configure_cluster(params.runtime.worker_hosts,
params.runtime.task_index)
# Note: for TPUs, strategy and scope should be created before the dataset
strategy = strategy_override or distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
strategy_scope = distribute_utils.get_strategy_scope(strategy)
logging.info('Detected %d devices.',
strategy.num_replicas_in_sync if strategy else 1)
label_smoothing = params.model.loss.label_smoothing
one_hot = label_smoothing and label_smoothing > 0
builders = _get_dataset_builders(params, strategy, one_hot)
datasets = [
builder.build(strategy) if builder else None for builder in builders
]
# Unpack datasets and builders based on train/val/test splits
train_builder, validation_builder = builders # pylint: disable=unbalanced-tuple-unpacking
train_dataset, validation_dataset = datasets
train_epochs = params.train.epochs
train_steps = params.train.steps or train_builder.num_steps
validation_steps = params.evaluation.steps or validation_builder.num_steps
initialize(params, train_builder)
logging.info('Global batch size: %d', train_builder.global_batch_size)
with strategy_scope:
model_params = params.model.model_params.as_dict()
model = get_models()[params.model.name](**model_params)
learning_rate = optimizer_factory.build_learning_rate(
params=params.model.learning_rate,
batch_size=train_builder.global_batch_size,
train_epochs=train_epochs,
train_steps=train_steps)
optimizer = optimizer_factory.build_optimizer(
optimizer_name=params.model.optimizer.name,
base_learning_rate=learning_rate,
params=params.model.optimizer.as_dict(),
model=model)
optimizer = performance.configure_optimizer(
optimizer,
use_float16=train_builder.dtype == 'float16',
loss_scale=get_loss_scale(params))
metrics_map = _get_metrics(one_hot)
metrics = [metrics_map[metric] for metric in params.train.metrics]
steps_per_loop = train_steps if params.train.set_epoch_loop else 1
if one_hot:
loss_obj = tf.keras.losses.CategoricalCrossentropy(
label_smoothing=params.model.loss.label_smoothing)
else:
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(
optimizer=optimizer,
loss=loss_obj,
metrics=metrics,
steps_per_execution=steps_per_loop)
initial_epoch = 0
if params.train.resume_checkpoint:
initial_epoch = resume_from_checkpoint(
model=model, model_dir=params.model_dir, train_steps=train_steps)
callbacks = custom_callbacks.get_callbacks(
model_checkpoint=params.train.callbacks.enable_checkpoint_and_export,
include_tensorboard=params.train.callbacks.enable_tensorboard,
time_history=params.train.callbacks.enable_time_history,
track_lr=params.train.tensorboard.track_lr,
write_model_weights=params.train.tensorboard.write_model_weights,
initial_step=initial_epoch * train_steps,
batch_size=train_builder.global_batch_size,
log_steps=params.train.time_history.log_steps,
model_dir=params.model_dir,
backup_and_restore=params.train.callbacks.enable_backup_and_restore)
serialize_config(params=params, model_dir=params.model_dir)
if params.evaluation.skip_eval:
validation_kwargs = {}
else:
validation_kwargs = {
'validation_data': validation_dataset,
'validation_steps': validation_steps,
'validation_freq': params.evaluation.epochs_between_evals,
}
history = model.fit(
train_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
initial_epoch=initial_epoch,
callbacks=callbacks,
verbose=2,
**validation_kwargs)
validation_output = None
if not params.evaluation.skip_eval:
validation_output = model.evaluate(
validation_dataset, steps=validation_steps, verbose=2)
# TODO(dankondratyuk): eval and save final test accuracy
stats = common.build_stats(history, validation_output, callbacks)
return stats
def export(params: base_configs.ExperimentConfig):
"""Runs the model export functionality."""
logging.info('Exporting model.')
model_params = params.model.model_params.as_dict()
model = get_models()[params.model.name](**model_params)
checkpoint = params.export.checkpoint
if checkpoint is None:
logging.info('No export checkpoint was provided. Using the latest '
'checkpoint from model_dir.')
checkpoint = tf.train.latest_checkpoint(params.model_dir)
model.load_weights(checkpoint)
model.save(params.export.destination)
def run(flags_obj: flags.FlagValues,
strategy_override: tf.distribute.Strategy = None) -> Mapping[str, Any]:
"""Runs Image Classification model using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
strategy_override: A `tf.distribute.Strategy` object to use for model.
Returns:
Dictionary of training/eval stats
"""
params = _get_params_from_flags(flags_obj)
if params.mode == 'train_and_eval':
return train_and_eval(params, strategy_override)
elif params.mode == 'export_only':
export(params)
else:
raise ValueError('{} is not a valid mode.'.format(params.mode))
def main(_):
stats = run(flags.FLAGS)
if stats:
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_classifier_flags()
flags.mark_flag_as_required('data_dir')
flags.mark_flag_as_required('mode')
flags.mark_flag_as_required('model_type')
flags.mark_flag_as_required('dataset')
app.run(main)