Commit 2412b118 authored by Gunho Park

Merge branch 'master' of https://github.com/tensorflow/models

parents f7783e7a 6dbdb08c
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for panoptic deeplab config."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.vision.beta.projects.panoptic_maskrcnn.configs import panoptic_deeplab as exp_cfg
class PanopticMaskRCNNConfigTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(
      ('panoptic_deeplab_resnet_coco', 'dilated_resnet'),
      ('panoptic_deeplab_mobilenetv3_large_coco', 'mobilenet'),
  )
  def test_panoptic_deeplab_configs(self, config_name, backbone_type):
    config = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(config, cfg.ExperimentConfig)
    self.assertIsInstance(config.task, exp_cfg.PanopticDeeplabTask)
    self.assertIsInstance(config.task.model, exp_cfg.PanopticDeeplab)
    self.assertIsInstance(config.task.train_data, exp_cfg.DataConfig)
    self.assertEqual(config.task.model.backbone.type, backbone_type)
    config.validate()
    config.task.train_data.is_training = None
    with self.assertRaisesRegex(KeyError, 'Found inconsistncy between key'):
      config.validate()


if __name__ == '__main__':
  tf.test.main()
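As a quick usage sketch outside the test harness, the same factory call builds and validates one of the experiments registered above; the batch-size override below is purely illustrative:

from official.core import exp_factory

config = exp_factory.get_exp_config('panoptic_deeplab_resnet_coco')
config.task.train_data.global_batch_size = 32  # illustrative override
config.validate()
print(config.task.model.backbone.type)  # 'dilated_resnet'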
# MobileNetV3-large_1.0 ImageNet classification: ~75.3% top-1.
# MobileNetV3-large_1.0 ImageNet classification: ~75.7% top-1.
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
......@@ -27,10 +27,10 @@ task:
dtype: 'bfloat16'
aug_type:
autoaug:
augmentation_name: v0
augmentation_name: 'v0'
cutout_const: 100
translate_const: 250
type: autoaug
type: 'autoaug'
validation_data:
input_path: 'imagenet-2012-tfrecord/valid*'
is_training: false
......@@ -38,7 +38,7 @@ task:
dtype: 'bfloat16'
drop_remainder: false
trainer:
train_steps: 156000 # 500 epochs
train_steps: 218000 # 700 epochs
validation_steps: 13
validation_interval: 312
steps_per_loop: 312 # NUM_EXAMPLES (1281167) // global_batch_size
......@@ -48,7 +48,7 @@ trainer:
learning_rate:
cosine:
alpha: 0.0
decay_steps: 156000
decay_steps: 218000
initial_learning_rate: 0.004
name: CosineDecay
offset: 0
......
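For reference, a minimal sketch of the cosine schedule those fields describe, using the stock Keras implementation (the `offset: 0` field above has no effect at zero and is omitted):

import tensorflow as tf

schedule = tf.keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.004, decay_steps=218000, alpha=0.0)
print(float(schedule(0)))       # 0.004 at the start of training
print(float(schedule(109000)))  # ~0.002 halfway through the decay
print(float(schedule(218000)))  # ~0.0 at the end (alpha * initial_learning_rate)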
# MobileNetV3Small ImageNet classification. 67.5% top-1 and 87.6% top-5 accuracy.
# MobileNetV3Small ImageNet classification. 67.5% top-1 and 87.7% top-5 accuracy.
runtime:
distribution_strategy: 'tpu'
mixed_precision_dtype: 'bfloat16'
......@@ -34,7 +34,7 @@ task:
drop_remainder: false
trainer:
train_steps: 312000 # 1000 epochs
validation_steps: 12
validation_steps: 13
validation_interval: 312
steps_per_loop: 312 # NUM_EXAMPLES (1281167) // global_batch_size
summary_interval: 312
......@@ -49,7 +49,7 @@ trainer:
learning_rate:
type: 'exponential'
exponential:
initial_learning_rate: 0.01
initial_learning_rate: 0.426 # 0.02 * (batch_size / 192)
decay_steps: 936 # 3 * steps_per_epoch
decay_rate: 0.99
staircase: true
......@@ -60,4 +60,4 @@ trainer:
type: 'linear'
linear:
warmup_steps: 1560
warmup_learning_rate: 0.001
warmup_learning_rate: 0.0
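A rough sketch of the schedule this second config now describes: staircase exponential decay entered through a 1560-step linear warmup that starts at 0.0. The Keras schedule below covers only the decay; how the Model Garden trainer blends the warmup into it is an assumption, as noted in the comment.

import tensorflow as tf

decay = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.426,  # 0.02 * (batch_size / 192), per the comment above
    decay_steps=936,              # 3 * steps_per_epoch
    decay_rate=0.99,
    staircase=True)

def learning_rate(step, warmup_steps=1560, warmup_lr=0.0):
  """Assumed combination rule: ramp linearly into the decay schedule."""
  step = tf.cast(step, tf.float32)
  warmup = warmup_lr + (decay(warmup_steps) - warmup_lr) * step / warmup_steps
  return tf.where(step < warmup_steps, warmup, decay(step))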
......@@ -107,6 +107,7 @@ class RetinaNetHead(hyperparams.Config):
num_filters: int = 256
use_separable_conv: bool = False
attribute_heads: List[AttributeHead] = dataclasses.field(default_factory=list)
share_classification_heads: bool = False
@dataclasses.dataclass
......
......@@ -254,6 +254,11 @@ class Parser(parser.Parser):
return image
def parse_train_image(self, decoded_tensors: Dict[str,
tf.Tensor]) -> tf.Tensor:
"""Public interface for parsing image data for training."""
return self._parse_train_image(decoded_tensors)
@classmethod
def inference_fn(cls,
image: tf.Tensor,
......
......@@ -293,6 +293,7 @@ def build_retinanet(
attribute_heads=[
cfg.as_dict() for cfg in (head_config.attribute_heads or [])
],
share_classification_heads=head_config.share_classification_heads,
use_separable_conv=head_config.use_separable_conv,
activation=norm_activation_config.activation,
use_sync_bn=norm_activation_config.use_sync_bn,
......
......@@ -37,6 +37,7 @@ class RetinaNetHead(tf.keras.layers.Layer):
num_convs: int = 4,
num_filters: int = 256,
attribute_heads: Optional[List[Dict[str, Any]]] = None,
share_classification_heads: bool = False,
use_separable_conv: bool = False,
activation: str = 'relu',
use_sync_bn: bool = False,
......@@ -62,6 +63,8 @@ class RetinaNetHead(tf.keras.layers.Layer):
additional attribute head. Each dict consists of 3 key-value pairs:
`name`, `type` ('regression' or 'classification'), and `size` (number
of predicted values for each instance).
share_classification_heads: A `bool` that indicates whether to share
weights among the main and attribute classification heads.
use_separable_conv: A `bool` that indicates whether separable
convolution layers are used.
activation: A `str` that indicates which activation is used, e.g. 'relu',
......@@ -88,6 +91,7 @@ class RetinaNetHead(tf.keras.layers.Layer):
'num_convs': num_convs,
'num_filters': num_filters,
'attribute_heads': attribute_heads,
'share_classification_heads': share_classification_heads,
'use_separable_conv': use_separable_conv,
'activation': activation,
'use_sync_bn': use_sync_bn,
......@@ -216,7 +220,11 @@ class RetinaNetHead(tf.keras.layers.Layer):
this_level_att_norms = []
for i in range(self._config_dict['num_convs']):
if level == self._config_dict['min_level']:
att_conv_name = '{}-conv_{}'.format(att_name, i)
if self._config_dict[
'share_classification_heads'] and att_type == 'classification':
att_conv_name = 'classnet-conv_{}'.format(i)
else:
att_conv_name = '{}-conv_{}'.format(att_name, i)
if 'kernel_initializer' in conv_kwargs:
conv_kwargs['kernel_initializer'] = tf_utils.clone_initializer(
conv_kwargs['kernel_initializer'])
......
......@@ -25,14 +25,15 @@ from official.vision.modeling.heads import dense_prediction_heads
class RetinaNetHeadTest(parameterized.TestCase, tf.test.TestCase):
@parameterized.parameters(
(False, False, False),
(False, True, False),
(True, False, True),
(True, True, True),
(False, False, False, None, False),
(False, True, False, None, False),
(True, False, True, 'regression', False),
(True, True, True, 'classification', True),
)
def test_forward(self, use_separable_conv, use_sync_bn, has_att_heads):
def test_forward(self, use_separable_conv, use_sync_bn, has_att_heads,
att_type, share_classification_heads):
if has_att_heads:
attribute_heads = [dict(name='depth', type='regression', size=1)]
attribute_heads = [dict(name='depth', type=att_type, size=1)]
else:
attribute_heads = None
......@@ -44,6 +45,7 @@ class RetinaNetHeadTest(parameterized.TestCase, tf.test.TestCase):
num_convs=2,
num_filters=256,
attribute_heads=attribute_heads,
share_classification_heads=share_classification_heads,
use_separable_conv=use_separable_conv,
activation='relu',
use_sync_bn=use_sync_bn,
......
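For completeness, a construction sketch mirroring the updated test; the level range, class count, and anchor count are not part of this diff, so the values below are assumptions:

from official.vision.modeling.heads import dense_prediction_heads

head = dense_prediction_heads.RetinaNetHead(
    min_level=3,                 # assumed
    max_level=4,                 # assumed
    num_classes=3,               # assumed
    num_anchors_per_location=3,  # assumed
    num_convs=2,
    num_filters=256,
    attribute_heads=[dict(name='depth', type='classification', size=1)],
    share_classification_heads=True,  # reuse the classnet convs for this head
    use_separable_conv=False,
    activation='relu',
    use_sync_bn=False)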
......@@ -158,7 +158,7 @@ class MaskRCNNModel(tf.keras.Model):
matched_gt_classes=intermediate_outputs['matched_gt_classes'],
gt_masks=gt_masks,
training=training)
model_outputs.update(model_mask_outputs)
model_outputs.update(model_mask_outputs) # pytype: disable=attribute-error # dynamic-method-lookup
return model_outputs
def _get_backbone_and_decoder_features(self, images):
......
......@@ -638,6 +638,53 @@ def random_horizontal_flip(image, normalized_boxes=None, masks=None, seed=1):
return image, normalized_boxes, masks
def random_horizontal_flip_with_roi(
image: tf.Tensor,
boxes: Optional[tf.Tensor] = None,
masks: Optional[tf.Tensor] = None,
roi_boxes: Optional[tf.Tensor] = None,
seed: int = 1
) -> Tuple[tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor],
Optional[tf.Tensor]]:
"""Randomly flips input image and bounding boxes.
Extends preprocess_ops.random_horizontal_flip to also flip roi_boxes used
by ViLD.
Args:
image: `tf.Tensor`, the image to apply the random flip.
boxes: `tf.Tensor` or `None`, boxes corresponding to the image.
masks: `tf.Tensor` or `None`, masks corresponding to the image.
roi_boxes: `tf.Tensor` or `None`, RoIs corresponding to the image.
seed: Seed for TensorFlow's random number generator.
Returns:
image: `tf.Tensor`, flipped image.
boxes: `tf.Tensor` or `None`, flipped boxes corresponding to the image.
masks: `tf.Tensor` or `None`, flipped masks corresponding to the image.
roi_boxes: `tf.Tensor` or `None`, flipped RoIs corresponding to the image.
"""
with tf.name_scope('random_horizontal_flip'):
do_flip = tf.greater(tf.random.uniform([], seed=seed), 0.5)
image = tf.cond(do_flip, lambda: horizontal_flip_image(image),
lambda: image)
if boxes is not None:
boxes = tf.cond(do_flip, lambda: horizontal_flip_boxes(boxes),
lambda: boxes)
if masks is not None:
masks = tf.cond(do_flip, lambda: horizontal_flip_masks(masks),
lambda: masks)
if roi_boxes is not None:
roi_boxes = tf.cond(do_flip, lambda: horizontal_flip_boxes(roi_boxes),
lambda: roi_boxes)
return image, boxes, masks, roi_boxes
def color_jitter(image: tf.Tensor,
brightness: Optional[float] = 0.,
contrast: Optional[float] = 0.,
......
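A short usage sketch for the new op; the module path is an assumption, and the boxes are normalized coordinates:

import tensorflow as tf
from official.vision.ops import preprocess_ops  # path is an assumption

image = tf.random.uniform([256, 256, 3])
boxes = tf.constant([[0.1, 0.2, 0.5, 0.6]])      # normalized boxes
roi_boxes = tf.constant([[0.0, 0.0, 1.0, 1.0]])  # RoIs used by ViLD
image, boxes, _, roi_boxes = preprocess_ops.random_horizontal_flip_with_roi(
    image, boxes=boxes, roi_boxes=roi_boxes)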
......@@ -18,8 +18,7 @@ from typing import Optional, Tuple
import tensorflow as tf
def _sample_or_pad_sequence_indices(sequence: tf.Tensor,
num_steps: int,
def _sample_or_pad_sequence_indices(sequence: tf.Tensor, num_steps: int,
stride: int,
offset: tf.Tensor) -> tf.Tensor:
"""Returns indices to take for sampling or padding sequences to fixed size."""
......@@ -28,18 +27,16 @@ def _sample_or_pad_sequence_indices(sequence: tf.Tensor,
# Repeats sequence until num_steps are available in total.
max_length = num_steps * stride + offset
num_repeats = tf.math.floordiv(
max_length + sequence_length - 1, sequence_length)
num_repeats = tf.math.floordiv(max_length + sequence_length - 1,
sequence_length)
sel_idx = tf.tile(sel_idx, [num_repeats])
steps = tf.range(offset, offset + num_steps * stride, stride)
return tf.gather(sel_idx, steps)
def sample_linspace_sequence(sequence: tf.Tensor,
num_windows: int,
num_steps: int,
stride: int) -> tf.Tensor:
def sample_linspace_sequence(sequence: tf.Tensor, num_windows: int,
num_steps: int, stride: int) -> tf.Tensor:
"""Samples `num_windows` segments from sequence with linearly spaced offsets.
The samples are concatenated in a single `tf.Tensor` in order to have the same
......@@ -66,11 +63,12 @@ def sample_linspace_sequence(sequence: tf.Tensor,
all_indices = []
for i in range(num_windows):
all_indices.append(_sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offsets[i]))
all_indices.append(
_sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offsets[i]))
indices = tf.concat(all_indices, axis=0)
indices.set_shape((num_windows * num_steps,))
......@@ -110,25 +108,76 @@ def sample_sequence(sequence: tf.Tensor,
sequence_length > (num_steps - 1) * frame_stride,
lambda: sequence_length - (num_steps - 1) * frame_stride,
lambda: sequence_length)
offset = tf.random.uniform(
(),
maxval=tf.cast(max_offset, dtype=tf.int32),
dtype=tf.int32,
seed=seed)
offset = tf.random.uniform((),
maxval=tf.cast(max_offset, dtype=tf.int32),
dtype=tf.int32,
seed=seed)
else:
offset = (sequence_length - num_steps * stride) // 2
offset = tf.maximum(0, offset)
indices = _sample_or_pad_sequence_indices(
sequence=sequence,
num_steps=num_steps,
stride=stride,
offset=offset)
sequence=sequence, num_steps=num_steps, stride=stride, offset=offset)
indices.set_shape((num_steps,))
return tf.gather(sequence, indices)
def sample_segment_sequence(sequence: tf.Tensor,
num_frames: int,
is_training: bool,
seed: Optional[int] = None) -> tf.Tensor:
"""Samples a single segment of size `num_frames` from a given sequence.
This function follows the temporal segment network sampling style
(https://arxiv.org/abs/1608.00859). The video sequence is divided into
`num_frames` non-overlapping segments of equal length. If `is_training` is
`True`, one frame is randomly sampled from each segment; if `is_training` is
`False`, only the center frame of each segment is sampled.
Args:
sequence: Any tensor where the first dimension is timesteps.
num_frames: Number of frames to take.
is_training: A boolean indicating sampling in training or evaluation mode.
seed: A deterministic seed to use when sampling.
Returns:
A single `tf.Tensor` whose first dimension is `num_frames`, containing the
sampled frames.
"""
sequence_length = tf.shape(sequence)[0]
sequence_length = tf.cast(sequence_length, tf.float32)
segment_length = tf.cast(sequence_length // num_frames, tf.float32)
segment_indices = tf.linspace(0.0, sequence_length, num_frames + 1)
segment_indices = tf.cast(segment_indices, tf.int32)
if is_training:
segment_length = tf.cast(segment_length, tf.int32)
# pylint:disable=g-long-lambda
segment_offsets = tf.cond(
segment_length == 0,
lambda: tf.zeros(shape=(num_frames,), dtype=tf.int32),
lambda: tf.random.uniform(
shape=(num_frames,),
minval=0,
maxval=segment_length,
dtype=tf.int32,
seed=seed))
# pylint:disable=g-long-lambda
else:
# Only sample the central frame during inference so the result is deterministic.
segment_offsets = tf.ones(
shape=(num_frames,), dtype=tf.int32) * tf.cast(
segment_length // 2, dtype=tf.int32)
indices = segment_indices[:-1] + segment_offsets
indices.set_shape((num_frames,))
return tf.gather(sequence, indices)
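A concrete illustration of the sampling, consistent with the unit test added further down: a 100-frame sequence split into 10 segments of 10 frames returns the segment centers in eval mode and one random frame per segment in training mode (module path is an assumption):

import tensorflow as tf
from official.vision.ops import preprocess_ops_3d  # path is an assumption

frames = tf.range(100)
eval_frames = preprocess_ops_3d.sample_segment_sequence(frames, 10, is_training=False)
# -> [5, 15, 25, ..., 95]
train_frames = preprocess_ops_3d.sample_segment_sequence(frames, 10, is_training=True, seed=1)
# -> one frame drawn uniformly from each of [0..9], [10..19], ..., [90..99]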
def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
"""Decodes JPEG raw bytes string into a RGB uint8 Tensor.
......@@ -144,7 +193,9 @@ def decode_jpeg(image_string: tf.Tensor, channels: int = 0) -> tf.Tensor:
"""
return tf.map_fn(
lambda x: tf.image.decode_jpeg(x, channels=channels),
image_string, back_prop=False, dtype=tf.uint8)
image_string,
back_prop=False,
dtype=tf.uint8)
def crop_image(frames: tf.Tensor,
......@@ -229,8 +280,7 @@ def crop_image(frames: tf.Tensor,
return frames
def resize_smallest(frames: tf.Tensor,
min_resize: int) -> tf.Tensor:
def resize_smallest(frames: tf.Tensor, min_resize: int) -> tf.Tensor:
"""Resizes frames so that min(`height`, `width`) is equal to `min_resize`.
This function will not do anything if the min(`height`, `width`) is already
......@@ -255,18 +305,15 @@ def resize_smallest(frames: tf.Tensor,
frames_resized = tf.image.resize(frames, (output_h, output_w))
return tf.cast(frames_resized, frames.dtype)
should_resize = tf.math.logical_or(tf.not_equal(input_w, output_w),
tf.not_equal(input_h, output_h))
should_resize = tf.math.logical_or(
tf.not_equal(input_w, output_w), tf.not_equal(input_h, output_h))
frames = tf.cond(should_resize, resize_fn, lambda: frames)
return frames
def random_crop_resize(frames: tf.Tensor,
output_h: int,
output_w: int,
num_frames: int,
num_channels: int,
def random_crop_resize(frames: tf.Tensor, output_h: int, output_w: int,
num_frames: int, num_channels: int,
aspect_ratio: Tuple[float, float],
area_range: Tuple[float, float]) -> tf.Tensor:
"""First crops clip with jittering and then resizes to (output_h, output_w).
......@@ -279,6 +326,7 @@ def random_crop_resize(frames: tf.Tensor,
num_channels: Number of channels of the clip.
aspect_ratio: Float tuple with the aspect range for cropping.
area_range: Float tuple with the area range for cropping.
Returns:
A Tensor of shape [timesteps, output_h, output_w, channels] of type
frames.dtype.
......@@ -299,21 +347,16 @@ def random_crop_resize(frames: tf.Tensor,
bbox_begin, bbox_size, _ = sample_distorted_bbox
offset_y, offset_x, _ = tf.unstack(bbox_begin)
target_height, target_width, _ = tf.unstack(bbox_size)
size = tf.convert_to_tensor((
seq_len, target_height, target_width, channels))
offset = tf.convert_to_tensor((
0, offset_y, offset_x, 0))
size = tf.convert_to_tensor((seq_len, target_height, target_width, channels))
offset = tf.convert_to_tensor((0, offset_y, offset_x, 0))
frames = tf.slice(frames, offset, size)
frames = tf.cast(
tf.image.resize(frames, (output_h, output_w)),
frames.dtype)
frames = tf.cast(tf.image.resize(frames, (output_h, output_w)), frames.dtype)
frames.set_shape((num_frames, output_h, output_w, num_channels))
return frames
def random_flip_left_right(
frames: tf.Tensor,
seed: Optional[int] = None) -> tf.Tensor:
def random_flip_left_right(frames: tf.Tensor,
seed: Optional[int] = None) -> tf.Tensor:
"""Flips all the frames with a probability of 50%.
Args:
......@@ -324,12 +367,16 @@ def random_flip_left_right(
A Tensor of shape [timesteps, output_h, output_w, channels], possibly
flipped left to right.
"""
is_flipped = tf.random.uniform(
(), minval=0, maxval=2, dtype=tf.int32, seed=seed)
frames = tf.cond(tf.equal(is_flipped, 1),
true_fn=lambda: tf.image.flip_left_right(frames),
false_fn=lambda: frames)
is_flipped = tf.random.uniform((),
minval=0,
maxval=2,
dtype=tf.int32,
seed=seed)
frames = tf.cond(
tf.equal(is_flipped, 1),
true_fn=lambda: tf.image.flip_left_right(frames),
false_fn=lambda: frames)
return frames
......
......@@ -72,6 +72,16 @@ class ParserUtilsTest(tf.test.TestCase):
self.assertBetween(offset_3, 0, 99)
self.assertAllEqual(sampled_seq_3, range(offset_3, offset_3 + 10))
def test_sample_segment_sequence(self):
sequence = tf.range(100)
sampled_seq_1 = preprocess_ops_3d.sample_segment_sequence(
sequence, 10, False)
sampled_seq_2 = preprocess_ops_3d.sample_segment_sequence(
sequence, 10, True)
self.assertAllEqual(sampled_seq_1, [5 + i * 10 for i in range(10)])
for idx, v in enumerate(sampled_seq_2):
self.assertBetween(v - idx * 10, 0, 10)
def test_decode_jpeg(self):
# Create a random RGB JPEG image.
random_image = np.random.randint(0, 256, size=(263, 320, 3), dtype=np.uint8)
......
......@@ -4235,6 +4235,15 @@ class CenterNetMetaArch(model.DetectionModel):
axis=-2)
multiclass_scores = postprocess_dict[
fields.DetectionResultFields.detection_multiclass_scores]
num_classes = tf.shape(multiclass_scores)[2]
class_mask = tf.cast(
tf.one_hot(
postprocess_dict[fields.DetectionResultFields.detection_classes],
depth=num_classes), tf.bool)
# Suppress the scores of unselected classes to zero. Otherwise, the
# downstream NMS ops might be confused and introduce issues.
multiclass_scores = tf.where(
class_mask, multiclass_scores, tf.zeros_like(multiclass_scores))
num_valid_boxes = postprocess_dict.pop(
fields.DetectionResultFields.num_detections)
# Remove scores and classes as NMS will compute these from multiclass
......
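A self-contained sketch of the masking trick added here: keep only each detection's selected-class score and zero the rest, so the downstream per-class NMS sees unambiguous scores (shapes are illustrative):

import tensorflow as tf

# [batch, num_boxes, num_classes] scores and the selected class id per box.
multiclass_scores = tf.constant([[[0.1, 0.7, 0.2],
                                  [0.4, 0.3, 0.3]]])
detection_classes = tf.constant([[1, 0]])

num_classes = tf.shape(multiclass_scores)[2]
class_mask = tf.cast(tf.one_hot(detection_classes, depth=num_classes), tf.bool)
multiclass_scores = tf.where(
    class_mask, multiclass_scores, tf.zeros_like(multiclass_scores))
# -> [[[0.0, 0.7, 0.0], [0.4, 0.0, 0.0]]]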