"vscode:/vscode.git/clone" did not exist on "de082f141c16992bb8584996938396d8ebcd1ac7"
Commit d55ee951 authored by Frederick Liu's avatar Frederick Liu Committed by A. Unique TensorFlower
Browse files

Internal change

PiperOrigin-RevId: 424422082
parent 2fc25efc
# End-to-End Object Detection with Transformers (DETR)
[![DETR](https://img.shields.io/badge/DETR-arXiv.2005.12872-B3181B?)](https://arxiv.org/abs/2005.12872).
TensorFlow 2 implementation of End-to-End Object Detection with Transformers
⚠️ Disclaimer: The datasets hyperlinked from this page are not owned or
distributed by Google; they are made available by third parties.
Please review the terms and conditions made available by the third parties
before using the data.
## Scripts:
You can find the scripts to reproduce the following experiments in
detr/experiments.
## DETR [COCO](https://cocodataset.org) ([ImageNet](https://www.image-net.org) pretrained)
| Model | Resolution | Batch size | Epochs | Decay@ | Params (M) | Box AP | Dashboard | Checkpoint | Experiment |
| --------- | :--------: | ----------:| ------:| -----: | ---------: | -----: | --------: | ---------: | ---------: |
| DETR-ResNet-50 | 1333x1333 |64|300| 200 |41 | 40.6 | [tensorboard](https://tensorboard.dev/experiment/o2IEZnniRYu6pqViBeopIg/#scalars) | [ckpt](https://storage.googleapis.com/tf_model_garden/vision/detr/detr_resnet_50_300.tar.gz) | detr_r50_300epochs.sh |
| DETR-ResNet-50 | 1333x1333 |64|500| 400 |41 | 42.0| [tensorboard](https://tensorboard.dev/experiment/YFMDKpESR4yjocPh5HgfRw/) | [ckpt](https://storage.googleapis.com/tf_model_garden/vision/detr/detr_resnet_50_500.tar.gz) | detr_r50_500epochs.sh |
| DETR-ResNet-50 | 1333x1333 |64|300| 200 |41 | 40.6 | paper | NA | NA |
| DETR-ResNet-50 | 1333x1333 |64|500| 400 |41 | 42.0 | paper | NA | NA |
| DETR-DC5-ResNet-50 | 1333x1333 |64|500| 400 |41 | 43.3 | paper | NA | NA |
## Need contribution:
* Add DC5 support and update experiment table.
## Citing TensorFlow Model Garden
If you find this codebase helpful in your research, please cite this repository.
```
@misc{tensorflowmodelgarden2020,
author = {Hongkun Yu and Chen Chen and Xianzhi Du and Yeqing Li and
Abdullah Rashwan and Le Hou and Pengchong Jin and Fan Yang and
Frederick Liu and Jaeyoun Kim and Jing Li},
title = {{TensorFlow Model Garden}},
howpublished = {\url{https://github.com/tensorflow/models}},
year = {2020}
}
```
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DETR configurations."""
import dataclasses
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.projects.detr import optimization
from official.projects.detr.dataloaders import coco
@dataclasses.dataclass
class DetectionConfig(cfg.TaskConfig):
  """The detection task config for DETR."""
  # Input pipelines for training and evaluation.
  train_data: cfg.DataConfig = cfg.DataConfig()
  validation_data: cfg.DataConfig = cfg.DataConfig()
  # Weights for the loss terms; names suggest classification, box regression
  # and generalized-IoU components — confirm against the loss implementation.
  lambda_cls: float = 1.0
  lambda_box: float = 5.0
  lambda_giou: float = 2.0
  # Checkpoint path used for initialization; empty string means no init.
  init_ckpt: str = ''
  num_classes: int = 81  # 0: background
  # Down-weights the background class in the classification loss.
  background_cls_weight: float = 0.1
  num_encoder_layers: int = 6
  num_decoder_layers: int = 6
  # Make DETRConfig.
  num_queries: int = 100
  num_hidden: int = 256
  per_category_metrics: bool = False
@exp_factory.register_config_factory('detr_coco')
def detr_coco() -> cfg.ExperimentConfig:
  """Config to get results that matches the paper."""
  batch_size_train = 64
  batch_size_eval = 64
  coco_train_examples = 118287
  steps_per_epoch = coco_train_examples // batch_size_train
  # 500 epochs total, with a single learning-rate drop at epoch 400.
  total_train_steps = 500 * steps_per_epoch
  lr_drop_step = total_train_steps - 100 * steps_per_epoch

  train_data = coco.COCODataConfig(
      tfds_name='coco/2017',
      tfds_split='train',
      is_training=True,
      global_batch_size=batch_size_train,
      shuffle_buffer_size=1000,
  )
  eval_data = coco.COCODataConfig(
      tfds_name='coco/2017',
      tfds_split='validation',
      is_training=False,
      global_batch_size=batch_size_eval,
      drop_remainder=False,
  )
  optimizer_config = optimization.OptimizationConfig({
      'optimizer': {
          'type': 'detr_adamw',
          'detr_adamw': {
              'weight_decay_rate': 1e-4,
              'global_clipnorm': 0.1,
              # Avoid AdamW legacy behavior.
              'gradient_clip_norm': 0.0
          }
      },
      'learning_rate': {
          'type': 'stepwise',
          'stepwise': {
              'boundaries': [lr_drop_step],
              'values': [0.0001, 1.0e-05]
          }
      },
  })
  trainer = cfg.TrainerConfig(
      train_steps=total_train_steps,
      validation_steps=-1,
      steps_per_loop=10000,
      summary_interval=10000,
      checkpoint_interval=10000,
      validation_interval=10000,
      max_to_keep=1,
      best_checkpoint_export_subdir='best_ckpt',
      best_checkpoint_eval_metric='AP',
      optimizer_config=optimizer_config)
  return cfg.ExperimentConfig(
      task=DetectionConfig(train_data=train_data, validation_data=eval_data),
      trainer=trainer,
      restrictions=[
          'task.train_data.is_training != None',
      ])
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for detr."""
# pylint: disable=unused-import
from absl.testing import parameterized
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.projects.detr.configs import detr as exp_cfg
from official.projects.detr.dataloaders import coco
class DetrTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.parameters(('detr_coco',))
  def test_detr_configs(self, config_name):
    """Registered experiment parses and bad overrides fail validation."""
    experiment = exp_factory.get_exp_config(config_name)
    self.assertIsInstance(experiment, cfg.ExperimentConfig)
    self.assertIsInstance(experiment.task, exp_cfg.DetectionConfig)
    self.assertIsInstance(experiment.task.train_data, coco.COCODataConfig)
    # Violating the `is_training != None` restriction must raise.
    experiment.task.train_data.is_training = None
    with self.assertRaises(KeyError):
      experiment.validate()


if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""COCO data loader for DETR."""
import dataclasses
from typing import Optional, Tuple
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import input_reader
from official.vision.beta.ops import box_ops
from official.vision.beta.ops import preprocess_ops
@dataclasses.dataclass
class COCODataConfig(cfg.DataConfig):
  """Data config for COCO."""
  # Height/width the padded model input is brought to.
  output_size: Tuple[int, int] = (1333, 1333)
  # Ground-truth tensors are clipped or padded to this many boxes.
  max_num_boxes: int = 100
  # Candidate short-side lengths (pixels) for multi-scale resizing.
  resize_scales: Tuple[int, ...] = (
      480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800)
class COCODataLoader():
  """A class to load dataset for COCO detection task."""

  def __init__(self, params: COCODataConfig):
    self._params = params

  def preprocess(self, inputs):
    """Preprocess COCO for DETR.

    Args:
      inputs: a dict of a single decoded TFDS COCO example, with keys
        'image', 'image/id' and 'objects' (containing 'bbox', 'label' and
        'is_crowd').

    Returns:
      A tuple of (image, labels) where image is padded to
      `output_size` and labels holds fixed-size 'classes' and 'boxes'
      (plus eval-only entries when not training).
    """
    image = inputs['image']
    boxes = inputs['objects']['bbox']
    # Shift labels by one so that 0 can denote background.
    classes = inputs['objects']['label'] + 1
    is_crowd = inputs['objects']['is_crowd']

    image = preprocess_ops.normalize_image(image)
    if self._params.is_training:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

      # With probability 0.5, rescale then take a random crop.
      do_crop = tf.greater(tf.random.uniform([]), 0.5)
      if do_crop:
        # Rescale to a randomly chosen intermediate short side.
        boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
        index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
        scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
        short_side = scales[0]
        image, image_info = preprocess_ops.resize_image(image, short_side)
        boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                                     image_info[2, :],
                                                     image_info[1, :],
                                                     image_info[3, :])
        boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

        # Take a random crop and remap the normalized boxes into it.
        shape = tf.cast(image_info[1], dtype=tf.int32)
        h = tf.random.uniform(
            [], 384, tf.math.minimum(shape[0], 600), dtype=tf.int32)
        w = tf.random.uniform(
            [], 384, tf.math.minimum(shape[1], 600), dtype=tf.int32)
        i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
        j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
        image = tf.image.crop_to_bounding_box(image, i, j, h, w)
        boxes = tf.clip_by_value(
            (boxes[..., :] * tf.cast(
                tf.stack([shape[0], shape[1], shape[0], shape[1]]),
                dtype=tf.float32) -
             tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
            tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32), 0.0, 1.0)
      # Sample the final short side uniformly from the configured scales.
      # Use the actual tuple length instead of a hard-coded 11 so that
      # overriding `resize_scales` in the config keeps sampling correct.
      scales = tf.constant(
          self._params.resize_scales,
          dtype=tf.float32)
      index = tf.random.categorical(
          tf.zeros([1, len(self._params.resize_scales)]), 1)[0]
      scales = tf.gather(scales, index, axis=0)
    else:
      # Eval: deterministic resize to the largest configured scale.
      scales = tf.constant([self._params.resize_scales[-1]], tf.float32)

    image_shape = tf.shape(image)[:2]
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Keep the pre-resize absolute boxes for evaluation.
    gt_boxes = boxes
    short_side = scales[0]
    image, image_info = preprocess_ops.resize_image(
        image,
        short_side,
        max(self._params.output_size))
    boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                                 image_info[2, :],
                                                 image_info[1, :],
                                                 image_info[3, :])
    boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    is_crowd = tf.gather(is_crowd, indices)
    # Convert to (center_y, center_x, height, width) as used by DETR.
    boxes = box_ops.yxyx_to_cycxhw(boxes)

    # Pad (top-left anchored) to the fixed model input size.
    image = tf.image.pad_to_bounding_box(
        image, 0, 0, self._params.output_size[0], self._params.output_size[1])
    labels = {
        'classes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                classes, self._params.max_num_boxes),
        'boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                boxes, self._params.max_num_boxes)
    }
    if not self._params.is_training:
      # Extra ground-truth metadata needed by COCO evaluation.
      labels.update({
          'id':
              inputs['image/id'],
          'image_info':
              image_info,
          'is_crowd':
              preprocess_ops.clip_or_pad_to_fixed_size(
                  is_crowd, self._params.max_num_boxes),
          'gt_boxes':
              preprocess_ops.clip_or_pad_to_fixed_size(
                  gt_boxes, self._params.max_num_boxes),
      })
    return image, labels

  def _transform_and_batch_fn(
      self,
      dataset,
      input_context: Optional[tf.distribute.InputContext] = None):
    """Preprocess and batch."""
    dataset = dataset.map(
        self.preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    per_replica_batch_size = input_context.get_per_replica_batch_size(
        self._params.global_batch_size
    ) if input_context else self._params.global_batch_size
    # Partial batches are dropped only during training.
    dataset = dataset.batch(
        per_replica_batch_size, drop_remainder=self._params.is_training)
    return dataset

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    """Returns a tf.dataset.Dataset."""
    reader = input_reader.InputReader(
        params=self._params,
        decoder_fn=None,
        transform_and_batch_fn=self._transform_and_batch_fn)
    return reader.read(input_context)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tensorflow_models.official.projects.detr.dataloaders.coco."""
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from official.projects.detr.dataloaders import coco
def _gen_fn():
  """Generates one random COCO-like example for mocking TFDS.

  Returns:
    A dict mimicking a decoded TFDS COCO example: an all-ones uint8 image
    of random size and `objects` sub-dict with `num_boxes` entries.
  """
  # Lower bound of 1 so generated images are never zero-sized, which would
  # break the resize ops in the data loader under test.
  h = np.random.randint(1, 300)
  w = np.random.randint(1, 300)
  num_boxes = np.random.randint(0, 50)
  return {
      'image': np.ones(shape=(h, w, 3), dtype=np.uint8),
      'image/id': np.random.randint(0, 100),
      'image/filename': 'test',
      'objects': {
          # `np.bool` was removed from NumPy (1.24+); use `np.bool_`.
          'is_crowd': np.ones(shape=(num_boxes), dtype=np.bool_),
          'bbox': np.ones(shape=(num_boxes, 4), dtype=np.float32),
          'label': np.ones(shape=(num_boxes), dtype=np.int64),
          'id': np.ones(shape=(num_boxes), dtype=np.int64),
          'area': np.ones(shape=(num_boxes), dtype=np.int64),
      }
  }
class CocoDataloaderTest(tf.test.TestCase, parameterized.TestCase):
  """Tests for the COCO data loader, using mocked TFDS data."""

  def test_load_dataset(self):
    # End-to-end check: mock TFDS, run the full load() pipeline, and verify
    # the batched output shapes.
    output_size = 1280
    max_num_boxes = 100
    batch_size = 2
    data_config = coco.COCODataConfig(
        tfds_name='coco/2017',
        tfds_split='validation',
        is_training=False,
        global_batch_size=batch_size,
        output_size=(output_size, output_size),
        max_num_boxes=max_num_boxes,
    )

    num_examples = 10

    def as_dataset(self, *args, **kwargs):
      # Replacement for the builder's `as_dataset`: yields `num_examples`
      # random examples conforming to the dataset's feature spec.
      del args
      del kwargs
      return tf.data.Dataset.from_generator(
          lambda: (_gen_fn() for i in range(num_examples)),
          output_types=self.info.features.dtype,
          output_shapes=self.info.features.shape,
      )

    with tfds.testing.mock_data(num_examples=num_examples,
                                as_dataset_fn=as_dataset):
      dataset = coco.COCODataLoader(data_config).load()
      dataset_iter = iter(dataset)
      images, labels = next(dataset_iter)
      self.assertEqual(images.shape, (batch_size, output_size, output_size, 3))
      self.assertEqual(labels['classes'].shape, (batch_size, max_num_boxes))
      self.assertEqual(labels['boxes'].shape, (batch_size, max_num_boxes, 4))
      # Eval-only labels attached when is_training=False.
      self.assertEqual(labels['id'].shape, (batch_size,))
      self.assertEqual(
          labels['image_info'].shape, (batch_size, 4, 2))
      self.assertEqual(labels['is_crowd'].shape, (batch_size, max_num_boxes))

  @parameterized.named_parameters(
      ('training', True),
      ('validation', False))
  def test_preprocess(self, is_training):
    # Unit-level check: run `preprocess` directly on one generated example
    # in both training and eval modes.
    output_size = 1280
    max_num_boxes = 100
    batch_size = 2
    data_config = coco.COCODataConfig(
        tfds_name='coco/2017',
        tfds_split='validation',
        is_training=is_training,
        global_batch_size=batch_size,
        output_size=(output_size, output_size),
        max_num_boxes=max_num_boxes,
    )

    dl = coco.COCODataLoader(data_config)
    inputs = _gen_fn()
    image, label = dl.preprocess(inputs)
    self.assertEqual(image.shape, (output_size, output_size, 3))
    self.assertEqual(label['classes'].shape, (max_num_boxes))
    self.assertEqual(label['boxes'].shape, (max_num_boxes, 4))
    if not is_training:
      # Eval mode additionally returns id / image_info / is_crowd.
      self.assertDTypeEqual(label['id'], int)
      self.assertEqual(
          label['image_info'].shape, (4, 2))
      self.assertEqual(label['is_crowd'].shape, (max_num_boxes))


if __name__ == '__main__':
  tf.test.main()
#!/bin/bash
# Trains and evaluates DETR-ResNet-50 on COCO starting from an
# ImageNet-pretrained ResNet-50 checkpoint.
# trainer.train_steps=554400 corresponds to 300 epochs at batch size 64
# (118287 // 64 = 1848 steps/epoch; 1848 * 300 = 554400), overriding the
# 500-epoch default of the `detr_coco` experiment.
python3 official/projects/detr/train.py \
  --experiment=detr_coco \
  --mode=train_and_eval \
  --model_dir=/tmp/logging_dir/ \
  --params_override=task.init_ckpt='gs://tf_model_garden/vision/resnet50_imagenet/ckpt-62400',trainer.train_steps=554400
#!/bin/bash
# Trains and evaluates DETR-ResNet-50 on COCO starting from an
# ImageNet-pretrained ResNet-50 checkpoint.
# No trainer.train_steps override, so the `detr_coco` experiment default
# (500 epochs) is used.
python3 official/projects/detr/train.py \
  --experiment=detr_coco \
  --mode=train_and_eval \
  --model_dir=/tmp/logging_dir/ \
  --params_override=task.init_ckpt='gs://tf_model_garden/vision/resnet50_imagenet/ckpt-62400'
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Implements End-to-End Object Detection with Transformers.
Model paper: https://arxiv.org/abs/2005.12872
This module does not support Keras de/serialization. Please use
tf.train.Checkpoint for object based saving and loading and tf.saved_model.save
for graph serialization.
"""
import math
import tensorflow as tf
from official.modeling import tf_utils
from official.projects.detr.modeling import transformer
from official.vision.beta.modeling.backbones import resnet
def position_embedding_sine(attention_mask,
                            num_pos_features=256,
                            temperature=10000.,
                            normalize=True,
                            scale=2 * math.pi):
  """Sine-based positional embeddings for 2D images.

  Args:
    attention_mask: a `bool` Tensor specifying the size of the input image to
      the Transformer and which elements are padded, of size [batch_size,
      height, width]
    num_pos_features: a `int` specifying the number of positional features,
      should be equal to the hidden size of the Transformer network
    temperature: a `float` specifying the temperature of the positional
      embedding. Any type that is converted to a `float` can also be accepted.
    normalize: a `bool` determining whether the positional embeddings should be
      normalized between [0, scale] before application of the sine and cos
      functions.
    scale: a `float` if normalize is True specifying the scale embeddings before
      application of the embedding function.

  Returns:
    embeddings: a `float32` tensor of shape [batch_size, height, width,
      num_pos_features] with the positional embeddings based on sine
      features: the row and column embeddings each fill half of the feature
      dimension and are concatenated.

  Raises:
    ValueError: if `num_pos_features` is odd.
  """
  if num_pos_features % 2 != 0:
    raise ValueError(
        "Number of embedding features (num_pos_features) must be even when "
        "column and row embeddings are concatenated.")
  # Half the features encode rows, the other half columns.
  num_pos_features = num_pos_features // 2

  # Produce row and column embeddings based on total size of the image
  # <tf.float>[batch_size, height, width]
  attention_mask = tf.cast(attention_mask, tf.float32)
  # Cumulative sums over the mask give each valid pixel its (row, col)
  # coordinate while ignoring padded regions' extents.
  row_embedding = tf.cumsum(attention_mask, 1)
  col_embedding = tf.cumsum(attention_mask, 2)

  if normalize:
    eps = 1e-6
    # Normalize coordinates into [0, scale] by the last valid coordinate.
    row_embedding = row_embedding / (row_embedding[:, -1:, :] + eps) * scale
    col_embedding = col_embedding / (col_embedding[:, :, -1:] + eps) * scale

  dim_t = tf.range(num_pos_features, dtype=row_embedding.dtype)
  dim_t = tf.pow(temperature, 2 * (dim_t // 2) / num_pos_features)

  # Creates positional embeddings for each row and column position
  # <tf.float>[batch_size, height, width, num_pos_features]
  pos_row = tf.expand_dims(row_embedding, -1) / dim_t
  pos_col = tf.expand_dims(col_embedding, -1) / dim_t
  # Interleave sine (even indices) and cosine (odd indices) features.
  pos_row = tf.stack(
      [tf.sin(pos_row[:, :, :, 0::2]),
       tf.cos(pos_row[:, :, :, 1::2])], axis=4)
  pos_col = tf.stack(
      [tf.sin(pos_col[:, :, :, 0::2]),
       tf.cos(pos_col[:, :, :, 1::2])], axis=4)

  # Use a dynamic shape so the op works with unknown height/width.
  final_shape = tf_utils.get_shape_list(pos_row)[:3] + [-1]
  pos_row = tf.reshape(pos_row, final_shape)
  pos_col = tf.reshape(pos_col, final_shape)
  output = tf.concat([pos_row, pos_col], -1)

  embeddings = tf.cast(output, tf.float32)
  return embeddings
class DETR(tf.keras.Model):
  """DETR model with Keras.

  DETR consists of backbone, query embedding, DETRTransformer,
  class and box heads.
  """

  def __init__(self, num_queries, hidden_size, num_classes,
               num_encoder_layers=6,
               num_decoder_layers=6,
               dropout_rate=0.1,
               **kwargs):
    """Initializes DETR.

    Args:
      num_queries: number of learned object queries fed to the decoder.
      hidden_size: Transformer hidden size; must be even because the sine
        position embedding splits it into row/column halves.
      num_classes: number of output classes for the classification head.
      num_encoder_layers: number of Transformer encoder layers.
      num_decoder_layers: number of Transformer decoder layers.
      dropout_rate: dropout rate used inside the Transformer.
      **kwargs: keyword arguments forwarded to tf.keras.Model.
    """
    super().__init__(**kwargs)
    self._num_queries = num_queries
    self._hidden_size = hidden_size
    self._num_classes = num_classes
    self._num_encoder_layers = num_encoder_layers
    self._num_decoder_layers = num_decoder_layers
    self._dropout_rate = dropout_rate
    if hidden_size % 2 != 0:
      raise ValueError("hidden_size must be a multiple of 2.")
    # TODO(frederickliu): Consider using the backbone factory.
    # TODO(frederickliu): Add to factory once we get skeleton code in.
    # ResNet-50 backbone with frozen batch-norm layers.
    self._backbone = resnet.ResNet(50, bn_trainable=False)

  def build(self, input_shape=None):
    # 1x1 conv projecting backbone features to the Transformer hidden size.
    self._input_proj = tf.keras.layers.Conv2D(
        self._hidden_size, 1, name="detr/conv2d")
    self._transformer = DETRTransformer(
        num_encoder_layers=self._num_encoder_layers,
        num_decoder_layers=self._num_decoder_layers,
        dropout_rate=self._dropout_rate)
    # Learned object queries: one embedding per predicted box slot.
    self._query_embeddings = self.add_weight(
        "detr/query_embeddings",
        shape=[self._num_queries, self._hidden_size],
        initializer=tf.keras.initializers.RandomNormal(mean=0., stddev=1.),
        dtype=tf.float32)
    # Uniform init bound scaled by fan-in, as for a standard linear layer.
    sqrt_k = math.sqrt(1.0 / self._hidden_size)
    self._class_embed = tf.keras.layers.Dense(
        self._num_classes,
        kernel_initializer=tf.keras.initializers.RandomUniform(-sqrt_k, sqrt_k),
        name="detr/cls_dense")
    # Three-layer MLP box head: hidden -> hidden -> 4 coordinates.
    self._bbox_embed = [
        tf.keras.layers.Dense(
            self._hidden_size, activation="relu",
            kernel_initializer=tf.keras.initializers.RandomUniform(
                -sqrt_k, sqrt_k),
            name="detr/box_dense_0"),
        tf.keras.layers.Dense(
            self._hidden_size, activation="relu",
            kernel_initializer=tf.keras.initializers.RandomUniform(
                -sqrt_k, sqrt_k),
            name="detr/box_dense_1"),
        tf.keras.layers.Dense(
            4, kernel_initializer=tf.keras.initializers.RandomUniform(
                -sqrt_k, sqrt_k),
            name="detr/box_dense_2")]
    # Squashes box outputs into [0, 1] (normalized coordinates).
    self._sigmoid = tf.keras.layers.Activation("sigmoid")
    super().build(input_shape)

  @property
  def backbone(self) -> tf.keras.Model:
    # Exposed so callers can e.g. restore backbone-only checkpoints.
    return self._backbone

  def get_config(self):
    return {
        "num_queries": self._num_queries,
        "hidden_size": self._hidden_size,
        "num_classes": self._num_classes,
        "num_encoder_layers": self._num_encoder_layers,
        "num_decoder_layers": self._num_decoder_layers,
        "dropout_rate": self._dropout_rate,
    }

  @classmethod
  def from_config(cls, config):
    return cls(**config)

  def call(self, inputs):
    """Runs DETR on a batch of images.

    Args:
      inputs: a float tensor of shape [batch, height, width, channels];
        padded pixels are assumed to be all-zero (see mask below).

    Returns:
      A list with one dict per decoder layer, each containing
      'cls_outputs' [batch, num_queries, num_classes] and
      'box_outputs' [batch, num_queries, 4].
    """
    batch_size = tf.shape(inputs)[0]
    # Valid-pixel mask: positions whose channel sum is nonzero. Padded
    # regions (all-zero pixels) are masked out.
    mask = tf.expand_dims(
        tf.cast(tf.not_equal(tf.reduce_sum(inputs, axis=-1), 0), inputs.dtype),
        axis=-1)
    # Level-"5" feature map from the backbone endpoints dict.
    features = self._backbone(inputs)["5"]
    shape = tf.shape(features)
    # Downsample the mask to the feature-map resolution.
    mask = tf.image.resize(
        mask, shape[1:3], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    pos_embed = position_embedding_sine(
        mask[:, :, :, 0], num_pos_features=self._hidden_size)
    pos_embed = tf.reshape(pos_embed, [batch_size, -1, self._hidden_size])

    # Flatten the spatial dimensions into a sequence for the Transformer.
    features = tf.reshape(
        self._input_proj(features), [batch_size, -1, self._hidden_size])
    mask = tf.reshape(mask, [batch_size, -1])

    decoded_list = self._transformer({
        "inputs":
            features,
        "targets":
            tf.tile(
                tf.expand_dims(self._query_embeddings, axis=0),
                (batch_size, 1, 1)),
        "pos_embed": pos_embed,
        "mask": mask,
    })
    out_list = []
    # One prediction dict per decoder layer (auxiliary outputs for loss).
    for decoded in decoded_list:
      decoded = tf.stack(decoded)
      output_class = self._class_embed(decoded)
      box_out = decoded
      for layer in self._bbox_embed:
        box_out = layer(box_out)
      output_coord = self._sigmoid(box_out)
      out = {"cls_outputs": output_class, "box_outputs": output_coord}
      out_list.append(out)

    return out_list
class DETRTransformer(tf.keras.layers.Layer):
  """Encoder and Decoder of DETR."""

  def __init__(self, num_encoder_layers=6, num_decoder_layers=6,
               dropout_rate=0.1, **kwargs):
    """Initializes the DETR Transformer.

    Args:
      num_encoder_layers: number of encoder layers.
      num_decoder_layers: number of decoder layers.
      dropout_rate: dropout rate applied to attention, outputs and
        intermediate layers.
      **kwargs: keyword arguments forwarded to tf.keras.layers.Layer.
    """
    super().__init__(**kwargs)
    self._dropout_rate = dropout_rate
    self._num_encoder_layers = num_encoder_layers
    self._num_decoder_layers = num_decoder_layers

  def build(self, input_shape=None):
    self._encoder = transformer.TransformerEncoder(
        attention_dropout_rate=self._dropout_rate,
        dropout_rate=self._dropout_rate,
        intermediate_dropout=self._dropout_rate,
        norm_first=False,
        num_layers=self._num_encoder_layers,
    )
    self._decoder = transformer.TransformerDecoder(
        attention_dropout_rate=self._dropout_rate,
        dropout_rate=self._dropout_rate,
        intermediate_dropout=self._dropout_rate,
        norm_first=False,
        num_layers=self._num_decoder_layers)
    super().build(input_shape)

  def get_config(self):
    return {
        "num_encoder_layers": self._num_encoder_layers,
        "num_decoder_layers": self._num_decoder_layers,
        "dropout_rate": self._dropout_rate,
    }

  def call(self, inputs):
    """Runs encoder then decoder.

    Args:
      inputs: a dict with keys "inputs" (flattened image features),
        "targets" (tiled query embeddings), "pos_embed" (flattened sine
        position embeddings) and "mask" (flattened valid-position mask).

    Returns:
      The decoder output; with `return_all_decoder_outputs=True` this is a
      list with one entry per decoder layer.
    """
    sources = inputs["inputs"]
    targets = inputs["targets"]
    pos_embed = inputs["pos_embed"]
    mask = inputs["mask"]
    input_shape = tf_utils.get_shape_list(sources)
    # Broadcast the 1D padding mask to [batch, source_len, source_len] for
    # encoder self-attention.
    source_attention_mask = tf.tile(
        tf.expand_dims(mask, axis=1), [1, input_shape[1], 1])
    memory = self._encoder(
        sources, attention_mask=source_attention_mask, pos_embed=pos_embed)

    target_shape = tf_utils.get_shape_list(targets)
    # [batch, target_len, source_len] mask for decoder cross-attention.
    cross_attention_mask = tf.tile(
        tf.expand_dims(mask, axis=1), [1, target_shape[1], 1])
    target_shape = tf.shape(targets)
    # Decoder input is zeros; the query embeddings enter via
    # `input_pos_embed` (added to queries/keys each layer).
    decoded = self._decoder(
        tf.zeros_like(targets),
        memory,
        # TODO(b/199545430): self_attention_mask could be set to None when this
        # bug is resolved. Passing ones for now.
        self_attention_mask=tf.ones(
            (target_shape[0], target_shape[1], target_shape[1])),
        cross_attention_mask=cross_attention_mask,
        return_all_decoder_outputs=True,
        input_pos_embed=targets,
        memory_pos_embed=pos_embed)
    return decoded
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tensorflow_models.official.projects.detr.detr."""
import tensorflow as tf
from official.projects.detr.modeling import detr
class DetrTest(tf.test.TestCase):

  def test_forward(self):
    """A forward pass yields one output dict per decoder layer."""
    num_queries, hidden_size, num_classes = 10, 128, 10
    image_size, batch_size = 640, 2
    model = detr.DETR(num_queries, hidden_size, num_classes)
    dummy_images = tf.ones((batch_size, image_size, image_size, 3))
    outputs = model(dummy_images)
    # Default DETR has 6 decoder layers, each producing auxiliary outputs.
    self.assertLen(outputs, 6)  # intermediate decoded outputs.
    for layer_out in outputs:
      self.assertAllEqual(
          tf.shape(layer_out['cls_outputs']),
          (batch_size, num_queries, num_classes))
      self.assertAllEqual(
          tf.shape(layer_out['box_outputs']), (batch_size, num_queries, 4))

  def test_get_from_config_detr_transformer(self):
    """DETRTransformer round-trips through from_config/get_config."""
    config = {
        'num_encoder_layers': 1,
        'num_decoder_layers': 2,
        'dropout_rate': 0.5,
    }
    rebuilt = detr.DETRTransformer.from_config(config)
    self.assertEqual(config, rebuilt.get_config())

  def test_get_from_config_detr(self):
    """DETR round-trips through from_config/get_config."""
    config = {
        'num_queries': 2,
        'hidden_size': 4,
        'num_classes': 10,
        'num_encoder_layers': 4,
        'num_decoder_layers': 5,
        'dropout_rate': 0.5,
    }
    rebuilt = detr.DETR.from_config(config)
    self.assertEqual(config, rebuilt.get_config())


if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Specialized Transformers for DETR.
The position embeddings are added to the query and key for every self- and
cross-attention layer.
"""
import tensorflow as tf
from official.nlp.modeling import layers
from official.nlp.modeling import models
class TransformerEncoder(tf.keras.layers.Layer):
  """Transformer encoder.

  A stack of `num_layers` identical layers, each composed of a
  self-attention sublayer and a two-layer feedforward network, followed by
  a final layer normalization over the stack's output.
  """

  def __init__(self,
               num_layers=6,
               num_attention_heads=8,
               intermediate_size=2048,
               activation="relu",
               dropout_rate=0.0,
               attention_dropout_rate=0.0,
               use_bias=False,
               norm_first=True,
               norm_epsilon=1e-6,
               intermediate_dropout=0.0,
               **kwargs):
    """Initialize a Transformer encoder.

    Args:
      num_layers: Number of layers.
      num_attention_heads: Number of attention heads.
      intermediate_size: Size of the intermediate (Feedforward) layer.
      activation: Activation for the intermediate layer.
      dropout_rate: Dropout probability.
      attention_dropout_rate: Dropout probability for attention layers.
      use_bias: Whether to enable use_bias in attention layer. If set False,
        use_bias in attention layer is disabled.
      norm_first: Whether to normalize inputs to attention and intermediate
        dense layers. If set False, output of attention and intermediate dense
        layers is normalized.
      norm_epsilon: Epsilon value to initialize normalization layers.
      intermediate_dropout: Dropout probability for intermediate_dropout_layer.
      **kwargs: keyword arguments passed to tf.keras.layers.Layer.
    """
    super().__init__(**kwargs)
    self.num_layers = num_layers
    self.num_attention_heads = num_attention_heads
    self._intermediate_size = intermediate_size
    self._activation = activation
    self._dropout_rate = dropout_rate
    self._attention_dropout_rate = attention_dropout_rate
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._intermediate_dropout = intermediate_dropout

  def build(self, input_shape):
    """Implements build() for the layer."""
    # Layer names ("layer_%d") are kept stable for checkpoint compatibility.
    self.encoder_layers = [
        TransformerEncoderBlock(
            num_attention_heads=self.num_attention_heads,
            inner_dim=self._intermediate_size,
            inner_activation=self._activation,
            output_dropout=self._dropout_rate,
            attention_dropout=self._attention_dropout_rate,
            use_bias=self._use_bias,
            norm_first=self._norm_first,
            norm_epsilon=self._norm_epsilon,
            inner_dropout=self._intermediate_dropout,
            attention_initializer=models.seq2seq_transformer
            .attention_initializer(input_shape[2]),
            name="layer_%d" % idx)
        for idx in range(self.num_layers)
    ]
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=self._norm_epsilon, dtype="float32")
    super().build(input_shape)

  def get_config(self):
    config = super().get_config()
    config.update({
        "num_layers": self.num_layers,
        "num_attention_heads": self.num_attention_heads,
        "intermediate_size": self._intermediate_size,
        "activation": self._activation,
        "dropout_rate": self._dropout_rate,
        "attention_dropout_rate": self._attention_dropout_rate,
        "use_bias": self._use_bias,
        "norm_first": self._norm_first,
        "norm_epsilon": self._norm_epsilon,
        "intermediate_dropout": self._intermediate_dropout
    })
    return config

  def call(self, encoder_inputs, attention_mask=None, pos_embed=None):
    """Return the output of the encoder.

    Args:
      encoder_inputs: A tensor with shape `(batch_size, input_length,
        hidden_size)`.
      attention_mask: A mask for the encoder self-attention layer with shape
        `(batch_size, input_length, input_length)`.
      pos_embed: Position embedding to add to every encoder layer.

    Returns:
      Output of encoder which is a `float32` tensor with shape
      `(batch_size, input_length, hidden_size)`.
    """
    hidden = encoder_inputs
    for encoder_block in self.encoder_layers:
      hidden = encoder_block([hidden, attention_mask, pos_embed])
    return self.output_normalization(hidden)
class TransformerEncoderBlock(tf.keras.layers.Layer):
  """TransformerEncoderBlock layer.

  This layer implements the Transformer Encoder from
  "Attention Is All You Need". (https://arxiv.org/abs/1706.03762),
  which combines a `tf.keras.layers.MultiHeadAttention` layer with a
  two-layer feedforward network. The only difference: position embedding is
  added to the query and key of self-attention (see `call`).

  References:
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
    [BERT: Pre-training of Deep Bidirectional Transformers for Language
    Understanding](https://arxiv.org/abs/1810.04805)
  """

  def __init__(self,
               num_attention_heads,
               inner_dim,
               inner_activation,
               output_range=None,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               use_bias=True,
               norm_first=False,
               norm_epsilon=1e-12,
               output_dropout=0.0,
               attention_dropout=0.0,
               inner_dropout=0.0,
               attention_initializer=None,
               attention_axes=None,
               **kwargs):
    """Initializes `TransformerEncoderBlock`.

    Args:
      num_attention_heads: Number of attention heads.
      inner_dim: The output dimension of the first Dense layer in a two-layer
        feedforward network.
      inner_activation: The activation for the first Dense layer in a two-layer
        feedforward network.
      output_range: the sequence output range, [0, output_range) for slicing
        the target sequence. `None` means the target sequence is not sliced.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in attention layer. If set False,
        use_bias in attention layer is disabled.
      norm_first: Whether to normalize inputs to attention and intermediate
        dense layers. If set False, output of attention and intermediate dense
        layers is normalized.
      norm_epsilon: Epsilon value to initialize normalization layers.
      output_dropout: Dropout probability for the post-attention and output
        dropout.
      attention_dropout: Dropout probability for within the attention layer.
      inner_dropout: Dropout probability for the first Dense layer in a
        two-layer feedforward network.
      attention_initializer: Initializer for kernels of attention layers. If
        set `None`, attention layers use kernel_initializer as initializer for
        kernel.
      attention_axes: axes over which the attention is applied. `None` means
        attention over all axes, but batch, heads, and features.
      **kwargs: keyword arguments passed to `tf.keras.layers.Layer`.
    """
    super().__init__(**kwargs)
    self._num_heads = num_attention_heads
    self._inner_dim = inner_dim
    self._inner_activation = inner_activation
    # The `*_rate` copies are kept deliberately: build() rebinds
    # `_attention_dropout` / `_output_dropout` to Dropout layer objects, and
    # get_config() reads the original float rates from the copies.
    self._attention_dropout = attention_dropout
    self._attention_dropout_rate = attention_dropout
    self._output_dropout = output_dropout
    self._output_dropout_rate = output_dropout
    self._output_range = output_range
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._inner_dropout = inner_dropout
    if attention_initializer:
      self._attention_initializer = tf.keras.initializers.get(
          attention_initializer)
    else:
      # Fall back to the dense-layer kernel initializer for attention kernels.
      self._attention_initializer = self._kernel_initializer
    self._attention_axes = attention_axes

  def build(self, input_shape):
    """Creates the attention, normalization and feedforward sublayers.

    `input_shape` is either the input tensor's shape, or a list/tuple whose
    first element is the input tensor's shape (the call convention is
    `[inputs, attention_mask, pos_embed]`).
    """
    if isinstance(input_shape, tf.TensorShape):
      input_tensor_shape = input_shape
    elif isinstance(input_shape, (list, tuple)):
      # First element is the embedding tensor's shape.
      input_tensor_shape = tf.TensorShape(input_shape[0])
    else:
      raise ValueError(
          "The type of input shape argument is not supported, got: %s" %
          type(input_shape))
    einsum_equation = "abc,cd->abd"
    if len(input_tensor_shape.as_list()) > 3:
      # Rank > 3: keep all leading axes via `...`.
      einsum_equation = "...bc,cd->...bd"
    hidden_size = input_tensor_shape[-1]
    if hidden_size % self._num_heads != 0:
      raise ValueError(
          "The input size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self._num_heads))
    self._attention_head_size = int(hidden_size // self._num_heads)
    # Shared initializer/regularizer/constraint kwargs for all sublayers.
    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    self._attention_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=self._num_heads,
        key_dim=self._attention_head_size,
        dropout=self._attention_dropout,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        attention_axes=self._attention_axes,
        name="self_attention",
        **common_kwargs)
    # NOTE(review): the post-attention Dropout uses the *output* dropout rate
    # (`self._output_dropout` is still the float here), not the attention
    # dropout rate — this matches upstream modeling layers; confirm intended.
    # This also rebinds `_attention_dropout` from a float to a Dropout layer.
    self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    # It is probably safe in mixed_float16, but we haven't validated this yet.
    self._attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))
    self._intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, self._inner_dim),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="intermediate",
        **common_kwargs)
    policy = tf.keras.mixed_precision.global_policy()
    if policy.name == "mixed_bfloat16":
      # bfloat16 causes BERT with the LAMB optimizer to not converge
      # as well, so we use float32.
      # TODO(b/154538392): Investigate this.
      policy = tf.float32
    self._intermediate_activation_layer = tf.keras.layers.Activation(
        self._inner_activation, dtype=policy)
    self._inner_dropout_layer = tf.keras.layers.Dropout(
        rate=self._inner_dropout)
    self._output_dense = tf.keras.layers.experimental.EinsumDense(
        einsum_equation,
        output_shape=(None, hidden_size),
        bias_axes="d",
        name="output",
        kernel_initializer=self._kernel_initializer,
        **common_kwargs)
    # Rebinds `_output_dropout` from the float rate to the Dropout layer; the
    # rate remains available in `_output_dropout_rate` for get_config().
    self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout)
    # Use float32 in layernorm for numeric stability.
    self._output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype=tf.float32)
    super(TransformerEncoderBlock, self).build(input_shape)

  def get_config(self):
    """Returns the serializable config; dropout rates come from the
    `*_rate` copies since build() rebinds the originals to layers."""
    config = {
        "num_attention_heads":
            self._num_heads,
        "inner_dim":
            self._inner_dim,
        "inner_activation":
            self._inner_activation,
        "output_dropout":
            self._output_dropout_rate,
        "attention_dropout":
            self._attention_dropout_rate,
        "output_range":
            self._output_range,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint),
        "use_bias":
            self._use_bias,
        "norm_first":
            self._norm_first,
        "norm_epsilon":
            self._norm_epsilon,
        "inner_dropout":
            self._inner_dropout,
        "attention_initializer":
            tf.keras.initializers.serialize(self._attention_initializer),
        "attention_axes":
            self._attention_axes,
    }
    base_config = super(TransformerEncoderBlock, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def call(self, inputs):
    """Transformer self-attention encoder block call.

    Args:
      inputs: a list of exactly three tensors: [`input tensor`, `attention
        mask`, `position embedding`]. `attention mask` may be None. The
        position embedding is added to the self-attention query and key and
        must not be None.

    Returns:
      An output tensor with the same dimensions as input/query tensor.
    """
    input_tensor, attention_mask, pos_embed = inputs
    # `key_value` is always None at this point, so self-attention keys/values
    # are taken from the (possibly normalized) input tensor below; the
    # `key_value is not None` branches are currently dead.
    key_value = None
    if self._output_range:
      if self._norm_first:
        # Keep an unnormalized slice for the residual connection.
        source_tensor = input_tensor[:, 0:self._output_range, :]
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor[:, 0:self._output_range, :]
      if attention_mask is not None:
        attention_mask = attention_mask[:, 0:self._output_range, :]
    else:
      if self._norm_first:
        # Pre-norm: remember the raw input for the residual connection.
        source_tensor = input_tensor
        input_tensor = self._attention_layer_norm(input_tensor)
        if key_value is not None:
          key_value = self._attention_layer_norm(key_value)
      target_tensor = input_tensor
    if key_value is None:
      key_value = input_tensor
    # Position embedding is added to query and key only, not to value
    # (see class docstring).
    attention_output = self._attention_layer(
        query=target_tensor + pos_embed,
        key=key_value + pos_embed,
        value=key_value,
        attention_mask=attention_mask)
    attention_output = self._attention_dropout(attention_output)
    if self._norm_first:
      # Pre-norm residual: add back the unnormalized source.
      attention_output = source_tensor + attention_output
    else:
      # Post-norm: normalize the residual sum.
      attention_output = self._attention_layer_norm(target_tensor +
                                                    attention_output)
    if self._norm_first:
      source_attention_output = attention_output
      attention_output = self._output_layer_norm(attention_output)
    inner_output = self._intermediate_dense(attention_output)
    inner_output = self._intermediate_activation_layer(inner_output)
    inner_output = self._inner_dropout_layer(inner_output)
    layer_output = self._output_dense(inner_output)
    layer_output = self._output_dropout(layer_output)
    if self._norm_first:
      return source_attention_output + layer_output
    # During mixed precision training, layer norm output is always fp32 for now.
    # Casts fp32 for the subsequent add.
    layer_output = tf.cast(layer_output, tf.float32)
    return self._output_layer_norm(layer_output + attention_output)
class TransformerDecoder(tf.keras.layers.Layer):
  """Transformer decoder.

  Like the encoder, the decoder is a stack of N identical layers. Each layer
  contains three sublayers:
    1. a self-attention layer,
    2. a multi-headed cross-attention layer combining encoder outputs with the
       results of the preceding self-attention layer, and
    3. a two-layer feedforward network.
  """

  def __init__(self,
               num_layers=6,
               num_attention_heads=8,
               intermediate_size=2048,
               activation="relu",
               dropout_rate=0.0,
               attention_dropout_rate=0.0,
               use_bias=False,
               norm_first=True,
               norm_epsilon=1e-6,
               intermediate_dropout=0.0,
               **kwargs):
    """Initialize a Transformer decoder.

    Args:
      num_layers: Number of stacked decoder layers.
      num_attention_heads: Number of attention heads per layer.
      intermediate_size: Size of the intermediate (feedforward) layer.
      activation: Activation for the intermediate layer.
      dropout_rate: Dropout probability.
      attention_dropout_rate: Dropout probability for attention layers.
      use_bias: Whether to enable use_bias in attention layers. If set
        `False`, use_bias in attention layers is disabled.
      norm_first: Whether to normalize inputs to attention and intermediate
        dense layers. If set `False`, output of attention and intermediate
        dense layers is normalized instead.
      norm_epsilon: Epsilon value to initialize normalization layers.
      intermediate_dropout: Dropout probability for intermediate_dropout_layer.
      **kwargs: Keyword arguments passed to `tf.keras.layers.Layer`.
    """
    super().__init__(**kwargs)
    self.num_layers = num_layers
    self.num_attention_heads = num_attention_heads
    self._intermediate_size = intermediate_size
    self._activation = activation
    self._dropout_rate = dropout_rate
    self._attention_dropout_rate = attention_dropout_rate
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._intermediate_dropout = intermediate_dropout

  def build(self, input_shape):
    """Creates the decoder blocks and the final output layer norm."""
    hidden_size = input_shape[2]
    self.decoder_layers = [
        TransformerDecoderBlock(
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self._intermediate_size,
            intermediate_activation=self._activation,
            dropout_rate=self._dropout_rate,
            attention_dropout_rate=self._attention_dropout_rate,
            use_bias=self._use_bias,
            norm_first=self._norm_first,
            norm_epsilon=self._norm_epsilon,
            intermediate_dropout=self._intermediate_dropout,
            attention_initializer=models.seq2seq_transformer
            .attention_initializer(hidden_size),
            name="layer_%d" % i)
        for i in range(self.num_layers)
    ]
    # float32 layer norm for numeric stability under mixed precision.
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=self._norm_epsilon, dtype="float32")
    super().build(input_shape)

  def get_config(self):
    """Returns the serializable config of this decoder."""
    decoder_config = {
        "num_layers": self.num_layers,
        "num_attention_heads": self.num_attention_heads,
        "intermediate_size": self._intermediate_size,
        "activation": self._activation,
        "dropout_rate": self._dropout_rate,
        "attention_dropout_rate": self._attention_dropout_rate,
        "use_bias": self._use_bias,
        "norm_first": self._norm_first,
        "norm_epsilon": self._norm_epsilon,
        "intermediate_dropout": self._intermediate_dropout
    }
    return {**super().get_config(), **decoder_config}

  def call(self,
           target,
           memory,
           self_attention_mask=None,
           cross_attention_mask=None,
           cache=None,
           decode_loop_step=None,
           return_all_decoder_outputs=False,
           input_pos_embed=None,
           memory_pos_embed=None):
    """Return the output of the decoder layer stacks.

    Args:
      target: A tensor with shape `(batch_size, target_length, hidden_size)`.
      memory: A tensor with shape `(batch_size, input_length, hidden_size)`.
      self_attention_mask: A tensor with shape `(batch_size, target_len,
        target_length)`, the mask for the decoder self-attention layer.
      cross_attention_mask: A tensor with shape `(batch_size, target_length,
        input_length)`, the mask for the encoder-decoder attention layer.
      cache: (Used for fast decoding) A nested dictionary storing previous
        decoder self-attention values. The items are:
        {layer_n: {"k": A tensor with shape `(batch_size, i, key_channels)`,
                   "v": A tensor with shape `(batch_size, i, value_channels)`},
         ...}
      decode_loop_step: An integer, the step number of the decoding loop. Used
        only for autoregressive inference on TPU.
      return_all_decoder_outputs: If `True`, return the (layer-normed) output
        of every decoder layer; useful for per-layer auxiliary losses.
      input_pos_embed: A tensor added to the query and key of the
        self-attention layer.
      memory_pos_embed: A tensor added to the query and key of the
        cross-attention layer.

    Returns:
      Output of the decoder: a `float32` tensor of shape
      `(batch_size, target_length, hidden_size)`, or a list of such tensors
      (one per layer) when `return_all_decoder_outputs` is set.
    """
    per_layer_outputs = []
    output_tensor = target
    for idx, decoder_layer in enumerate(self.decoder_layers):
      layer_inputs = [
          output_tensor, memory, cross_attention_mask, self_attention_mask,
          input_pos_embed, memory_pos_embed
      ]
      if cache is None:
        output_tensor, _ = decoder_layer(layer_inputs)
      else:
        # Fast decoding: thread this layer's cache through the block.
        cache_key = str(idx)
        output_tensor, cache[cache_key] = decoder_layer(
            layer_inputs,
            cache=cache[cache_key],
            decode_loop_step=decode_loop_step)
      if return_all_decoder_outputs:
        per_layer_outputs.append(self.output_normalization(output_tensor))
    if return_all_decoder_outputs:
      return per_layer_outputs
    return self.output_normalization(output_tensor)
class TransformerDecoderBlock(tf.keras.layers.Layer):
  """Single transformer layer for decoder.

  It has three sub-layers:
  (1) a multi-head self-attention mechanism (with optional caching for fast
      autoregressive decoding).
  (2) a encoder-decoder attention.
  (3) a positionwise fully connected feed-forward network.
  """

  def __init__(self,
               num_attention_heads,
               intermediate_size,
               intermediate_activation,
               dropout_rate=0.0,
               attention_dropout_rate=0.0,
               kernel_initializer="glorot_uniform",
               bias_initializer="zeros",
               kernel_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               bias_constraint=None,
               use_bias=True,
               norm_first=False,
               norm_epsilon=1e-12,
               intermediate_dropout=0.0,
               attention_initializer=None,
               **kwargs):
    """Initialize a Transformer decoder block.

    Args:
      num_attention_heads: Number of attention heads.
      intermediate_size: Size of the intermediate layer.
      intermediate_activation: Activation for the intermediate layer.
      dropout_rate: Dropout probability for the post-attention and output
        dropout.
      attention_dropout_rate: Dropout probability for within the attention
        layer.
      kernel_initializer: Initializer for dense layer kernels.
      bias_initializer: Initializer for dense layer biases.
      kernel_regularizer: Regularizer for dense layer kernels.
      bias_regularizer: Regularizer for dense layer biases.
      activity_regularizer: Regularizer for dense layer activity.
      kernel_constraint: Constraint for dense layer kernels.
      bias_constraint: Constraint for dense layer biases.
      use_bias: Whether to enable use_bias in attention layer. If set False,
        use_bias in attention layer is disabled.
      norm_first: Whether to normalize inputs to attention and intermediate
        dense layers. If set False, output of attention and intermediate dense
        layers is normalized.
      norm_epsilon: Epsilon value to initialize normalization layers.
      intermediate_dropout: Dropout probability for intermediate_dropout_layer.
      attention_initializer: Initializer for kernels of attention layers. If
        set `None`, attention layers use kernel_initializer as initializer for
        kernel.
      **kwargs: keyword arguments passed to `tf.keras.layers.Layer`.
    """
    super().__init__(**kwargs)
    self.num_attention_heads = num_attention_heads
    self.intermediate_size = intermediate_size
    self.intermediate_activation = tf.keras.activations.get(
        intermediate_activation)
    self.dropout_rate = dropout_rate
    self.attention_dropout_rate = attention_dropout_rate
    self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
    self._bias_initializer = tf.keras.initializers.get(bias_initializer)
    self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
    self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
    self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer)
    self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
    self._bias_constraint = tf.keras.constraints.get(bias_constraint)
    self._use_bias = use_bias
    self._norm_first = norm_first
    self._norm_epsilon = norm_epsilon
    self._intermediate_dropout = intermediate_dropout
    if attention_initializer:
      self._attention_initializer = tf.keras.initializers.get(
          attention_initializer)
    else:
      # Fall back to the dense-layer kernel initializer for attention kernels.
      self._attention_initializer = self._kernel_initializer
    # Cross-attention class is held as an attribute (project-local
    # MultiHeadAttention); instantiated in build().
    self._cross_attention_cls = layers.attention.MultiHeadAttention

  def build(self, input_shape):
    """Creates the attention, normalization and feedforward sublayers.

    `input_shape` is a list whose first element is the target tensor's shape
    `[batch, sequence, width]` (matching the `call` inputs list).
    """
    target_tensor_shape = tf.TensorShape(input_shape[0])
    if len(target_tensor_shape.as_list()) != 3:
      raise ValueError("TransformerLayer expects a three-dimensional input of "
                       "shape [batch, sequence, width].")
    hidden_size = target_tensor_shape[2]
    if hidden_size % self.num_attention_heads != 0:
      raise ValueError(
          "The hidden size (%d) is not a multiple of the number of attention "
          "heads (%d)" % (hidden_size, self.num_attention_heads))
    self.attention_head_size = int(hidden_size) // self.num_attention_heads
    # Shared initializer/regularizer/constraint kwargs for all sublayers.
    common_kwargs = dict(
        bias_initializer=self._bias_initializer,
        kernel_regularizer=self._kernel_regularizer,
        bias_regularizer=self._bias_regularizer,
        activity_regularizer=self._activity_regularizer,
        kernel_constraint=self._kernel_constraint,
        bias_constraint=self._bias_constraint)
    # Self attention.
    self.self_attention = layers.attention.CachedAttention(
        num_heads=self.num_attention_heads,
        key_dim=self.attention_head_size,
        dropout=self.attention_dropout_rate,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        name="self_attention",
        **common_kwargs)
    # NOTE(review): this dense layer is built but never used in call();
    # CachedAttention already projects its output — confirm whether it is
    # intentional (it still creates variables under the "output" name).
    self.self_attention_output_dense = tf.keras.layers.experimental.EinsumDense(
        "abc,cd->abd",
        output_shape=(None, hidden_size),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="output",
        **common_kwargs)
    self.self_attention_dropout = tf.keras.layers.Dropout(
        rate=self.dropout_rate)
    # float32 layer norms for numeric stability under mixed precision.
    self.self_attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype="float32"))
    # Encoder-decoder attention.
    self.encdec_attention = self._cross_attention_cls(
        num_heads=self.num_attention_heads,
        key_dim=self.attention_head_size,
        dropout=self.attention_dropout_rate,
        output_shape=hidden_size,
        use_bias=self._use_bias,
        kernel_initializer=self._attention_initializer,
        name="attention/encdec",
        **common_kwargs)
    self.encdec_attention_dropout = tf.keras.layers.Dropout(
        rate=self.dropout_rate)
    self.encdec_attention_layer_norm = (
        tf.keras.layers.LayerNormalization(
            name="attention/encdec_output_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype="float32"))
    # Feed-forward projection.
    self.intermediate_dense = tf.keras.layers.experimental.EinsumDense(
        "abc,cd->abd",
        output_shape=(None, self.intermediate_size),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="intermediate",
        **common_kwargs)
    self.intermediate_activation_layer = tf.keras.layers.Activation(
        self.intermediate_activation)
    self._intermediate_dropout_layer = tf.keras.layers.Dropout(
        rate=self._intermediate_dropout)
    self.output_dense = tf.keras.layers.experimental.EinsumDense(
        "abc,cd->abd",
        output_shape=(None, hidden_size),
        bias_axes="d",
        kernel_initializer=self._kernel_initializer,
        name="output",
        **common_kwargs)
    self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
    self.output_layer_norm = tf.keras.layers.LayerNormalization(
        name="output_layer_norm",
        axis=-1,
        epsilon=self._norm_epsilon,
        dtype="float32")
    super().build(input_shape)

  def get_config(self):
    """Returns the serializable config of this decoder block."""
    config = {
        "num_attention_heads":
            self.num_attention_heads,
        "intermediate_size":
            self.intermediate_size,
        "intermediate_activation":
            tf.keras.activations.serialize(self.intermediate_activation),
        "dropout_rate":
            self.dropout_rate,
        "attention_dropout_rate":
            self.attention_dropout_rate,
        "kernel_initializer":
            tf.keras.initializers.serialize(self._kernel_initializer),
        "bias_initializer":
            tf.keras.initializers.serialize(self._bias_initializer),
        "kernel_regularizer":
            tf.keras.regularizers.serialize(self._kernel_regularizer),
        "bias_regularizer":
            tf.keras.regularizers.serialize(self._bias_regularizer),
        "activity_regularizer":
            tf.keras.regularizers.serialize(self._activity_regularizer),
        "kernel_constraint":
            tf.keras.constraints.serialize(self._kernel_constraint),
        "bias_constraint":
            tf.keras.constraints.serialize(self._bias_constraint),
        "use_bias":
            self._use_bias,
        "norm_first":
            self._norm_first,
        "norm_epsilon":
            self._norm_epsilon,
        "intermediate_dropout":
            self._intermediate_dropout,
        "attention_initializer":
            tf.keras.initializers.serialize(self._attention_initializer)
    }
    base_config = super().get_config()
    return dict(list(base_config.items()) + list(config.items()))

  def common_layers_with_encoder(self):
    """Gets layer objects that can make a Transformer encoder block."""
    return [
        self.self_attention, self.self_attention_layer_norm,
        self.intermediate_dense, self.output_dense, self.output_layer_norm
    ]

  def call(self, inputs, cache=None, decode_loop_step=None):
    """Runs self-attention, cross-attention and the feedforward sublayers.

    Args:
      inputs: a list of six tensors: [target tensor, memory tensor,
        cross-attention mask, self-attention mask, input position embedding,
        memory position embedding]. Masks may be None; the position embeddings
        are added to attention queries/keys and must not be None.
      cache: (optional) dict of cached `"k"`/`"v"` self-attention values for
        fast autoregressive decoding, forwarded to `CachedAttention`.
      decode_loop_step: (optional) integer decoding-loop step, used only for
        autoregressive inference on TPU.

    Returns:
      A `(layer_output, cache)` tuple, where `layer_output` has the same
      shape as the target tensor.
    """
    input_tensor, memory, attention_mask, self_attention_mask, input_pos_embed, memory_pos_embed = inputs
    # Keep the raw input for the pre-norm residual connection.
    source_tensor = input_tensor
    if self._norm_first:
      input_tensor = self.self_attention_layer_norm(input_tensor)
    # Position embedding is added to query and key only, not to value.
    self_attention_output, cache = self.self_attention(
        query=input_tensor + input_pos_embed,
        key=input_tensor + input_pos_embed,
        value=input_tensor,
        attention_mask=self_attention_mask,
        cache=cache,
        decode_loop_step=decode_loop_step)
    self_attention_output = self.self_attention_dropout(self_attention_output)
    if self._norm_first:
      # Pre-norm residual: add back the unnormalized source.
      self_attention_output = source_tensor + self_attention_output
    else:
      # Post-norm: normalize the residual sum.
      self_attention_output = self.self_attention_layer_norm(
          input_tensor + self_attention_output)
    if self._norm_first:
      source_self_attention_output = self_attention_output
      self_attention_output = self.encdec_attention_layer_norm(
          self_attention_output)
    # Cross attention: queries carry the input position embedding, keys carry
    # the memory position embedding.
    cross_attn_inputs = dict(
        query=self_attention_output + input_pos_embed,
        key=memory + memory_pos_embed,
        value=memory,
        attention_mask=attention_mask)
    attention_output = self.encdec_attention(**cross_attn_inputs)
    attention_output = self.encdec_attention_dropout(attention_output)
    if self._norm_first:
      attention_output = source_self_attention_output + attention_output
    else:
      attention_output = self.encdec_attention_layer_norm(
          self_attention_output + attention_output)
    if self._norm_first:
      source_attention_output = attention_output
      attention_output = self.output_layer_norm(attention_output)
    intermediate_output = self.intermediate_dense(attention_output)
    intermediate_output = self.intermediate_activation_layer(
        intermediate_output)
    intermediate_output = self._intermediate_dropout_layer(intermediate_output)
    layer_output = self.output_dense(intermediate_output)
    layer_output = self.output_dropout(layer_output)
    if self._norm_first:
      layer_output = source_attention_output + layer_output
    else:
      layer_output = self.output_layer_norm(layer_output + attention_output)
    return layer_output, cache
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for transformer."""
import tensorflow as tf
from official.projects.detr.modeling import transformer
class TransformerTest(tf.test.TestCase):
  """Unit tests for the DETR transformer layers."""

  def test_transformer_encoder_block(self):
    batch, seq_len, width = 2, 100, 256
    block = transformer.TransformerEncoderBlock(
        num_attention_heads=2, inner_dim=256, inner_activation='relu')
    features = tf.ones((batch, seq_len, width))
    mask = tf.ones((batch, seq_len, seq_len), dtype=tf.int64)
    pos_embed = tf.ones((batch, seq_len, width))
    output = block([features, mask, pos_embed])
    self.assertAllEqual(tf.shape(output), (batch, seq_len, width))

  def test_transformer_encoder_block_get_config(self):
    block = transformer.TransformerEncoderBlock(
        num_attention_heads=2, inner_dim=256, inner_activation='relu')
    glorot = {'class_name': 'GlorotUniform', 'config': {'seed': None}}
    expected_config = {
        'name': 'transformer_encoder_block',
        'trainable': True,
        'dtype': 'float32',
        'num_attention_heads': 2,
        'inner_dim': 256,
        'inner_activation': 'relu',
        'output_dropout': 0.0,
        'attention_dropout': 0.0,
        'output_range': None,
        'kernel_initializer': glorot,
        'bias_initializer': {'class_name': 'Zeros', 'config': {}},
        'kernel_regularizer': None,
        'bias_regularizer': None,
        'activity_regularizer': None,
        'kernel_constraint': None,
        'bias_constraint': None,
        'use_bias': True,
        'norm_first': False,
        'norm_epsilon': 1e-12,
        'inner_dropout': 0.0,
        'attention_initializer': glorot,
        'attention_axes': None,
    }
    self.assertAllEqual(expected_config, block.get_config())

  def test_transformer_encoder(self):
    batch, seq_len, width = 2, 100, 256
    encoder = transformer.TransformerEncoder(
        num_layers=2, num_attention_heads=2, intermediate_size=256)
    features = tf.ones((batch, seq_len, width))
    mask = tf.ones((batch, seq_len, seq_len), dtype=tf.int64)
    pos_embed = tf.ones((batch, seq_len, width))
    output = encoder(features, mask, pos_embed)
    self.assertAllEqual(tf.shape(output), (batch, seq_len, width))

  def test_transformer_encoder_get_config(self):
    encoder = transformer.TransformerEncoder(
        num_layers=2, num_attention_heads=2, intermediate_size=256)
    expected_config = {
        'name': 'transformer_encoder',
        'trainable': True,
        'dtype': 'float32',
        'num_layers': 2,
        'num_attention_heads': 2,
        'intermediate_size': 256,
        'activation': 'relu',
        'dropout_rate': 0.0,
        'attention_dropout_rate': 0.0,
        'use_bias': False,
        'norm_first': True,
        'norm_epsilon': 1e-06,
        'intermediate_dropout': 0.0
    }
    self.assertAllEqual(expected_config, encoder.get_config())

  def test_transformer_decoder_block(self):
    batch, target_len, memory_len, width = 2, 100, 200, 256
    block = transformer.TransformerDecoderBlock(
        num_attention_heads=2,
        intermediate_size=256,
        intermediate_activation='relu')
    target = tf.ones((batch, target_len, width))
    memory = tf.ones((batch, memory_len, width))
    cross_mask = tf.ones((batch, target_len, memory_len), dtype=tf.int64)
    self_mask = tf.ones((batch, target_len, target_len), dtype=tf.int64)
    input_pos_embed = tf.ones((batch, target_len, width))
    memory_pos_embed = tf.ones((batch, memory_len, width))
    output, _ = block([
        target, memory, cross_mask, self_mask, input_pos_embed,
        memory_pos_embed
    ])
    self.assertAllEqual(tf.shape(output), (batch, target_len, width))

  def test_transformer_decoder_block_get_config(self):
    block = transformer.TransformerDecoderBlock(
        num_attention_heads=2,
        intermediate_size=256,
        intermediate_activation='relu')
    glorot = {'class_name': 'GlorotUniform', 'config': {'seed': None}}
    expected_config = {
        'name': 'transformer_decoder_block',
        'trainable': True,
        'dtype': 'float32',
        'num_attention_heads': 2,
        'intermediate_size': 256,
        'intermediate_activation': 'relu',
        'dropout_rate': 0.0,
        'attention_dropout_rate': 0.0,
        'kernel_initializer': glorot,
        'bias_initializer': {'class_name': 'Zeros', 'config': {}},
        'kernel_regularizer': None,
        'bias_regularizer': None,
        'activity_regularizer': None,
        'kernel_constraint': None,
        'bias_constraint': None,
        'use_bias': True,
        'norm_first': False,
        'norm_epsilon': 1e-12,
        'intermediate_dropout': 0.0,
        'attention_initializer': glorot
    }
    self.assertAllEqual(expected_config, block.get_config())

  def test_transformer_decoder(self):
    batch, target_len, memory_len, width = 2, 100, 200, 256
    num_layers = 2
    decoder = transformer.TransformerDecoder(
        num_layers=num_layers, num_attention_heads=2, intermediate_size=256)
    target = tf.ones((batch, target_len, width))
    memory = tf.ones((batch, memory_len, width))
    cross_mask = tf.ones((batch, target_len, memory_len), dtype=tf.int64)
    self_mask = tf.ones((batch, target_len, target_len), dtype=tf.int64)
    input_pos_embed = tf.ones((batch, target_len, width))
    memory_pos_embed = tf.ones((batch, memory_len, width))
    outputs = decoder(
        target,
        memory,
        self_mask,
        cross_mask,
        return_all_decoder_outputs=True,
        input_pos_embed=input_pos_embed,
        memory_pos_embed=memory_pos_embed)
    # One (layer-normalized) intermediate output per decoder layer.
    self.assertLen(outputs, num_layers)
    for output in outputs:
      self.assertAllEqual(tf.shape(output), (batch, target_len, width))

  def test_transformer_decoder_get_config(self):
    decoder = transformer.TransformerDecoder(
        num_layers=2, num_attention_heads=2, intermediate_size=256)
    expected_config = {
        'name': 'transformer_decoder',
        'trainable': True,
        'dtype': 'float32',
        'num_layers': 2,
        'num_attention_heads': 2,
        'intermediate_size': 256,
        'activation': 'relu',
        'dropout_rate': 0.0,
        'attention_dropout_rate': 0.0,
        'use_bias': False,
        'norm_first': True,
        'norm_epsilon': 1e-06,
        'intermediate_dropout': 0.0
    }
    self.assertAllEqual(expected_config, decoder.get_config())
# Run all test cases in this module when executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tensorflow implementation to solve the Linear Sum Assignment problem.
The Linear Sum Assignment problem involves determining the minimum weight
matching for bipartite graphs. For example, this problem can be defined by
a 2D matrix C, where each element i,j determines the cost of matching worker i
with job j. The solution to the problem is a complete assignment of jobs to
workers, such that no job is assigned to more than one worker and no worker is
assigned more than one job, with minimum cost.
This implementation builds off of the Hungarian
Matching Algorithm (https://www.cse.ust.hk/~golin/COMP572/Notes/Matching.pdf).
Based on the original implementation by Jiquan Ngiam <jngiam@google.com>.
"""
import tensorflow as tf
from official.modeling import tf_utils
def _prepare(weights):
  """Shifts the cost matrix so every row and column contains a zero.

  Subtracting the per-column and per-row minima leaves the optimal assignment
  unchanged (every worker needs a job and every job needs a worker) while
  producing non-negative entries, which gives the greedy assignment a much
  better starting point. This corresponds to the pre-processing and step 1 of
  the Hungarian algorithm from Wikipedia.

  Args:
    weights: A float32 [batch_size, num_elems, num_elems] tensor, where each
      inner matrix represents weights to be used for matching.

  Returns:
    A prepared weights tensor of the same shape and dtype.
  """
  # Remove each row's minimum, then each column's minimum of the result.
  row_min = tf.reduce_min(weights, axis=2, keepdims=True)
  shifted = weights - row_min
  col_min = tf.reduce_min(shifted, axis=1, keepdims=True)
  return shifted - col_min
def _greedy_assignment(adj_matrix):
  """Greedily assigns workers to jobs based on an adjacency matrix.

  Starting with an adjacency matrix representing the available connections
  in the bi-partite graph, this function greedily chooses elements such
  that each worker is matched to at most one job (or each job is assigned to
  at most one worker). Note, if the adjacency matrix has no available values
  for a particular row/column, the corresponding job/worker may go unassigned.

  Args:
    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
      element of the inner matrix represents whether the worker (row) can be
      matched to the job (column).

  Returns:
    A bool [batch_size, num_elems, num_elems] tensor, where each element of the
    inner matrix represents whether the worker has been matched to the job.
    Each row and column can have at most one true element. Some of the rows
    and columns may not be matched.
  """
  _, num_elems, _ = tf_utils.get_shape_list(adj_matrix, expected_rank=3)
  # Transpose to [num_elems, batch_size, num_elems] so tf.foldl below can
  # iterate over worker rows.
  adj_matrix = tf.transpose(adj_matrix, [1, 0, 2])
  # Create a dynamic TensorArray containing the assignments for each worker/job
  assignment = tf.TensorArray(tf.bool, num_elems)
  # Store the elements assigned to each column to update each iteration
  col_assigned = tf.zeros_like(adj_matrix[0, ...], dtype=tf.bool)
  # Iteratively assign each row using tf.foldl. Intuitively, this is a loop
  # over rows, where we incrementally assign each row.
  def _assign_row(accumulator, row_adj):
    # The accumulator tracks the row assignment index.
    idx, assignment, col_assigned = accumulator
    # Viable candidates cannot already be assigned to another job.
    candidates = row_adj & (~col_assigned)
    # Deterministically assign to the candidates of the highest index count.
    max_candidate_idx = tf.argmax(
        tf.cast(candidates, tf.int32), axis=1, output_type=tf.int32)
    candidates_indicator = tf.one_hot(
        max_candidate_idx,
        num_elems,
        on_value=True,
        off_value=False,
        dtype=tf.bool)
    # Zero the indicator when the row had no viable candidate at all: argmax
    # of an all-False row still points at index 0.
    candidates_indicator &= candidates
    # Make assignment to the column.
    col_assigned |= candidates_indicator
    assignment = assignment.write(idx, candidates_indicator)
    return (idx + 1, assignment, col_assigned)
  _, assignment, _ = tf.foldl(
      _assign_row, adj_matrix, (0, assignment, col_assigned), back_prop=False)
  # Stack the per-row results and restore the [batch, worker, job] layout.
  assignment = assignment.stack()
  assignment = tf.transpose(assignment, [1, 0, 2])
  return assignment
def _find_augmenting_path(assignment, adj_matrix):
  """Finds an augmenting path given an assignment and an adjacency matrix.

  The augmenting path search starts from the unassigned workers, then goes on
  to find jobs (via an unassigned pairing), then back again to workers (via an
  existing pairing), and so on. The path alternates between unassigned and
  existing pairings. Returns the state after the search.

  Note: In the state the worker and job, indices are 1-indexed so that we can
  use 0 to represent unreachable nodes. State contains the following keys:

  - jobs: A [batch_size, 1, num_elems] tensor containing the highest index
    unassigned worker that can reach this job through a path.
  - jobs_from_worker: A [batch_size, num_elems] tensor containing the worker
    reached immediately before this job.
  - workers: A [batch_size, num_elems, 1] tensor containing the highest index
    unassigned worker that can reach this worker through a path.
  - workers_from_job: A [batch_size, num_elems] tensor containing the job
    reached immediately before this worker.
  - new_jobs: A bool [batch_size, num_elems] tensor containing True if the
    unassigned job can be reached via a path.

  State can be used to recover the path via backtracking.

  Args:
    assignment: A bool [batch_size, num_elems, num_elems] tensor, where each
      element of the inner matrix represents whether the worker has been matched
      to the job. This may be a partial assignment.
    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
      element of the inner matrix represents whether the worker (row) can be
      matched to the job (column).

  Returns:
    A state dict, which represents the outcome of running an augmenting
    path search on the graph given the assignment.
  """
  batch_size, num_elems, _ = tf_utils.get_shape_list(
      assignment, expected_rank=3)
  unassigned_workers = ~tf.reduce_any(assignment, axis=2, keepdims=True)
  unassigned_jobs = ~tf.reduce_any(assignment, axis=1, keepdims=True)
  unassigned_pairings = tf.cast(adj_matrix & ~assignment, tf.int32)
  existing_pairings = tf.cast(assignment, tf.int32)
  # Initialize unassigned workers to have non-zero ids, assigned workers will
  # have ids = 0.
  worker_indices = tf.range(1, num_elems + 1, dtype=tf.int32)
  init_workers = tf.tile(worker_indices[tf.newaxis, :, tf.newaxis],
                         [batch_size, 1, 1])
  init_workers *= tf.cast(unassigned_workers, tf.int32)
  state = {
      "jobs": tf.zeros((batch_size, 1, num_elems), dtype=tf.int32),
      "jobs_from_worker": tf.zeros((batch_size, num_elems), dtype=tf.int32),
      "workers": init_workers,
      "workers_from_job": tf.zeros((batch_size, num_elems), dtype=tf.int32)
  }
  def _has_active_workers(state, curr_workers):
    """Check if there are still active workers."""
    del state
    return tf.reduce_sum(curr_workers) > 0
  def _augment_step(state, curr_workers):
    """Performs one search step."""
    # Note: These steps could be potentially much faster if sparse matrices are
    # supported. The unassigned_pairings and existing_pairings matrices can be
    # very sparse.
    # Find potential jobs using current workers.
    potential_jobs = curr_workers * unassigned_pairings
    curr_jobs = tf.reduce_max(potential_jobs, axis=1, keepdims=True)
    curr_jobs_from_worker = 1 + tf.argmax(
        potential_jobs, axis=1, output_type=tf.int32)
    # Remove already accessible jobs from curr_jobs.
    default_jobs = tf.zeros_like(state["jobs"], dtype=state["jobs"].dtype)
    curr_jobs = tf.where(state["jobs"] > 0, default_jobs, curr_jobs)
    curr_jobs_from_worker *= tf.cast(curr_jobs > 0, tf.int32)[:, 0, :]
    # Find potential workers from current jobs.
    potential_workers = curr_jobs * existing_pairings
    curr_workers = tf.reduce_max(potential_workers, axis=2, keepdims=True)
    curr_workers_from_job = 1 + tf.argmax(
        potential_workers, axis=2, output_type=tf.int32)
    # Remove already accessible workers from curr_workers.
    default_workers = tf.zeros_like(state["workers"])
    curr_workers = tf.where(
        state["workers"] > 0, default_workers, curr_workers)
    curr_workers_from_job *= tf.cast(curr_workers > 0, tf.int32)[:, :, 0]
    # Update state so that we can backtrack later.
    state = state.copy()
    state["jobs"] = tf.maximum(state["jobs"], curr_jobs)
    state["jobs_from_worker"] = tf.maximum(state["jobs_from_worker"],
                                           curr_jobs_from_worker)
    state["workers"] = tf.maximum(state["workers"], curr_workers)
    state["workers_from_job"] = tf.maximum(state["workers_from_job"],
                                           curr_workers_from_job)
    return state, curr_workers
  state, _ = tf.while_loop(
      _has_active_workers,
      _augment_step, (state, init_workers),
      back_prop=False)
  # Compute new jobs, this is useful for determining termination of the
  # maximum bi-partite matching and initialization for backtracking.
  new_jobs = (state["jobs"] > 0) & unassigned_jobs
  state["new_jobs"] = new_jobs[:, 0, :]
  return state
def _improve_assignment(assignment, state):
  """Improves an assignment by backtracking the augmented path using state.

  Args:
    assignment: A bool [batch_size, num_elems, num_elems] tensor, where each
      element of the inner matrix represents whether the worker has been matched
      to the job. This may be a partial assignment.
    state: A dict, which represents the outcome of running an augmenting path
      search on the graph given the assignment.

  Returns:
    A new assignment matrix of the same shape and type as assignment, where the
    assignment has been updated using the augmented path found.
  """
  batch_size, num_elems, _ = tf_utils.get_shape_list(assignment, 3)
  # We store the current job id and iteratively backtrack using jobs_from_worker
  # and workers_from_job until we reach an unassigned worker. We flip all the
  # assignments on this path to discover a better overall assignment.
  # Note: The indices in state are 1-indexed, where 0 represents that the
  # worker / job cannot be reached.
  # Obtain initial job indices based on new_jobs.
  curr_job_idx = tf.argmax(
      tf.cast(state["new_jobs"], tf.int32), axis=1, output_type=tf.int32)
  # Track whether an example is actively being backtracked. Since we are
  # operating on a batch, not all examples in the batch may be active.
  active = tf.gather(state["new_jobs"], curr_job_idx, batch_dims=1)
  batch_range = tf.range(0, batch_size, dtype=tf.int32)
  # Flip matrix tracks which assignments we need to flip - corresponding to the
  # augmenting path taken. We use an integer tensor here so that we can use
  # tensor_scatter_nd_add to update the tensor, and then cast it back to bool
  # after the loop.
  flip_matrix = tf.zeros((batch_size, num_elems, num_elems), dtype=tf.int32)
  def _has_active_backtracks(flip_matrix, active, curr_job_idx):
    """Check if there are still active workers."""
    del flip_matrix, curr_job_idx
    return tf.reduce_any(active)
  def _backtrack_one_step(flip_matrix, active, curr_job_idx):
    """Take one step in backtracking."""
    # Discover the worker that the job originated from, note that this worker
    # must exist by construction.
    curr_worker_idx = tf.gather(
        state["jobs_from_worker"], curr_job_idx, batch_dims=1) - 1
    curr_worker_idx = tf.maximum(curr_worker_idx, 0)
    update_indices = tf.stack([batch_range, curr_worker_idx, curr_job_idx],
                              axis=1)
    update_indices = tf.maximum(update_indices, 0)
    # Only flip entries for examples still being backtracked (active is cast
    # to 0/1 so inactive examples add nothing).
    flip_matrix = tf.tensor_scatter_nd_add(flip_matrix, update_indices,
                                           tf.cast(active, tf.int32))
    # Discover the (potential) job that the worker originated from.
    curr_job_idx = tf.gather(
        state["workers_from_job"], curr_worker_idx, batch_dims=1) - 1
    # Note that jobs may not be active, and we track that here (before
    # adjusting indices so that they are all >= 0 for gather).
    active &= curr_job_idx >= 0
    curr_job_idx = tf.maximum(curr_job_idx, 0)
    update_indices = tf.stack([batch_range, curr_worker_idx, curr_job_idx],
                              axis=1)
    update_indices = tf.maximum(update_indices, 0)
    flip_matrix = tf.tensor_scatter_nd_add(flip_matrix, update_indices,
                                           tf.cast(active, tf.int32))
    return flip_matrix, active, curr_job_idx
  flip_matrix, _, _ = tf.while_loop(
      _has_active_backtracks,
      _backtrack_one_step, (flip_matrix, active, curr_job_idx),
      back_prop=False)
  flip_matrix = tf.cast(flip_matrix, tf.bool)
  # XOR flips exactly the assignments on the augmenting path.
  assignment = tf.math.logical_xor(assignment, flip_matrix)
  return assignment
def _maximum_bipartite_matching(adj_matrix, assignment=None):
  """Performs maximum bipartite matching using augmented paths.

  Args:
    adj_matrix: A bool [batch_size, num_elems, num_elems] tensor, where each
      element of the inner matrix represents whether the worker (row) can be
      matched to the job (column).
    assignment: An optional bool [batch_size, num_elems, num_elems] tensor,
      where each element of the inner matrix represents whether the worker has
      been matched to the job. This may be a partial assignment. If specified,
      this assignment will be used to seed the iterative algorithm.

  Returns:
    A state dict representing the final augmenting path state search, and
    a maximum bipartite matching assignment tensor. Note that the state outcome
    can be used to compute a minimum vertex cover for the bipartite graph.
  """
  if assignment is None:
    assignment = _greedy_assignment(adj_matrix)

  state = _find_augmenting_path(assignment, adj_matrix)

  def _can_still_improve(search_state, matching):
    # An augmenting path exists iff some unassigned job became reachable.
    del matching
    return tf.reduce_any(search_state["new_jobs"])

  def _apply_path_and_research(search_state, matching):
    # Flip the found path into the matching, then search again.
    matching = _improve_assignment(matching, search_state)
    return _find_augmenting_path(matching, adj_matrix), matching

  state, assignment = tf.while_loop(
      _can_still_improve,
      _apply_path_and_research, (state, assignment),
      back_prop=False)
  return state, assignment
def _compute_cover(state, assignment):
  """Computes a cover for the bipartite graph.

  We compute a cover using the construction provided at
  https://en.wikipedia.org/wiki/K%C5%91nig%27s_theorem_(graph_theory)#Proof
  which uses the outcome from the alternating path search: covered workers are
  the assigned rows the search could not reach, and covered jobs are the
  assigned columns it could reach.

  Args:
    state: A state dict, which represents the outcome of running an augmenting
      path search on the graph given the assignment.
    assignment: An optional bool [batch_size, num_elems, num_elems] tensor,
      where each element of the inner matrix represents whether the worker has
      been matched to the job. This may be a partial assignment. If specified,
      this assignment will be used to seed the iterative algorithm.

  Returns:
    A tuple of (workers_cover, jobs_cover) corresponding to row and column
    covers for the bipartite graph. workers_cover is a boolean tensor of shape
    [batch_size, num_elems, 1] and jobs_cover is a boolean tensor of shape
    [batch_size, 1, num_elems].
  """
  worker_has_job = tf.reduce_any(assignment, axis=2, keepdims=True)
  job_has_worker = tf.reduce_any(assignment, axis=1, keepdims=True)
  worker_reachable = state["workers"] > 0
  job_reachable = state["jobs"] > 0
  workers_cover = worker_has_job & (~worker_reachable)
  jobs_cover = job_has_worker & job_reachable
  return workers_cover, jobs_cover
def _update_weights_using_cover(workers_cover, jobs_cover, weights):
  """Updates weights for hungarian matching using a cover.

  We first find the minimum uncovered weight. Then, we subtract this from all
  the uncovered weights, and add it to all the doubly covered weights.

  Args:
    workers_cover: A boolean tensor of shape [batch_size, num_elems, 1].
    jobs_cover: A boolean tensor of shape [batch_size, 1, num_elems].
    weights: A float32 [batch_size, num_elems, num_elems] tensor, where each
      inner matrix represents weights to be used for matching.

  Returns:
    A new weight matrix with elements adjusted by the cover.
  """
  covered = workers_cover | jobs_cover
  doubly_covered = workers_cover & jobs_cover
  # Mask covered entries with the global maximum so they never win the min.
  ceiling = tf.reduce_max(weights)
  masked_weights = tf.where(covered, tf.ones_like(weights) * ceiling, weights)
  delta = tf.reduce_min(masked_weights, axis=[-2, -1], keepdims=True)
  # Doubly covered entries gain delta; uncovered entries lose delta.
  increase = tf.where(doubly_covered,
                      tf.ones_like(weights) * delta,
                      tf.zeros_like(weights))
  decrease = tf.where(covered, tf.zeros_like(weights),
                      tf.ones_like(weights) * delta)
  return weights + increase - decrease
def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape.
  """
  if isinstance(expected_rank, int):
    allowed_ranks = {expected_rank}
  else:
    allowed_ranks = set(expected_rank)
  actual_rank = len(tensor.shape)
  if actual_rank not in allowed_ranks:
    raise ValueError(
        "For the tensor `%s`, the actual tensor rank `%d` (shape = %s) is not "
        "equal to the expected tensor rank `%s`" %
        (name, actual_rank, str(tensor.shape), str(expected_rank)))
def hungarian_matching(weights):
  """Computes the minimum linear sum assignment using the Hungarian algorithm.

  Args:
    weights: A float32 [batch_size, num_elems, num_elems] tensor, where each
      inner matrix represents weights to be used for matching.

  Returns:
    A tuple of:
      weights: The float32 cost matrix after preparation and cover-based
        adjustments, as used by the final matching iteration.
      assignment: A bool [batch_size, num_elems, num_elems] tensor, where each
        element of the inner matrix represents whether the worker has been
        matched to the job. The returned matching will always be a perfect
        match.
  """
  batch_size, num_elems, _ = tf_utils.get_shape_list(weights, 3)
  # Shift weights so every row and column contains a zero; zeros become the
  # admissible edges of the bipartite graph.
  weights = _prepare(weights)
  adj_matrix = tf.equal(weights, 0.)
  state, assignment = _maximum_bipartite_matching(adj_matrix)
  workers_cover, jobs_cover = _compute_cover(state, assignment)
  def _cover_incomplete(workers_cover, jobs_cover, *args):
    # A perfect matching exists iff the minimum cover has size
    # batch_size * num_elems (Konig's theorem, per batch element).
    del args
    cover_sum = (
        tf.reduce_sum(tf.cast(workers_cover, tf.int32)) +
        tf.reduce_sum(tf.cast(jobs_cover, tf.int32)))
    return tf.less(cover_sum, batch_size * num_elems)
  def _update_weights_and_match(workers_cover, jobs_cover, weights, assignment):
    # Adjust weights to create new zero entries, then re-match from the
    # previous (partial) assignment.
    weights = _update_weights_using_cover(workers_cover, jobs_cover, weights)
    adj_matrix = tf.equal(weights, 0.)
    state, assignment = _maximum_bipartite_matching(adj_matrix, assignment)
    workers_cover, jobs_cover = _compute_cover(state, assignment)
    return workers_cover, jobs_cover, weights, assignment
  workers_cover, jobs_cover, weights, assignment = tf.while_loop(
      _cover_incomplete,
      _update_weights_and_match,
      (workers_cover, jobs_cover, weights, assignment),
      back_prop=False)
  return weights, assignment
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for tensorflow_models.official.projects.detr.ops.matchers."""
import numpy as np
from scipy import optimize
import tensorflow as tf
from official.projects.detr.ops import matchers
class MatchersOpsTest(tf.test.TestCase):

  def testLinearSumAssignment(self):
    """Check a simple 2D test case of the Linear Sum Assignment problem.

    Ensures that the implementation of the matching algorithm is correct
    and functional on TPUs.
    """
    costs = tf.constant([[[4, 1, 3], [2, 0, 5], [3, 2, 2]]], dtype=tf.float32)
    _, match = matchers.hungarian_matching(costs)
    expected = np.array([
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
    ], dtype=bool)
    self.assertAllEqual(match.numpy()[0], expected)

  def testBatchedLinearSumAssignment(self):
    """Check a batched case of the Linear Sum Assignment Problem.

    Ensures that a correct solution is found for all inputted problems within
    a batch.
    """
    costs = np.array([
        [[4, 1, 3], [2, 0, 5], [3, 2, 2]],
        [[1, 4, 3], [0, 2, 5], [2, 3, 2]],
        [[1, 3, 4], [0, 5, 2], [2, 2, 3]],
    ],
                     dtype=np.float32)
    _, match = matchers.hungarian_matching(tf.constant(costs))
    # Hand solved correct output for the linear sum assignment problem.
    expected = np.array([
        [[0, 1, 0], [1, 0, 0], [0, 0, 1]],
        [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
        [[1, 0, 0], [0, 0, 1], [0, 1, 0]],
    ],
                        dtype=bool)
    self.assertAllClose(match.numpy(), expected)

  def testMaximumBipartiteMatching(self):
    """Check that the maximum bipartite match assigns the correct numbers."""
    graph = tf.cast([[
        [1, 0, 0, 0, 1],
        [0, 1, 0, 1, 0],
        [0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 0],
    ]], tf.bool)
    _, match = matchers._maximum_bipartite_matching(graph)
    # The graph above admits a perfect matching of all five workers.
    self.assertEqual(np.sum(match.numpy()), 5)

  def testAssignmentMatchesScipy(self):
    """Check that the Linear Sum Assignment matches the Scipy implementation."""
    batch_size, num_elems = 2, 25
    costs = tf.random.uniform((batch_size, num_elems, num_elems),
                              minval=0.,
                              maxval=1.)
    adjusted_costs, match = matchers.hungarian_matching(costs)
    for batch in range(batch_size):
      _, scipy_cols = optimize.linear_sum_assignment(
          adjusted_costs.numpy()[batch])
      our_cols = np.where(match.numpy()[batch])[1]
      self.assertAllEqual(our_cols, scipy_cols)
# Run all test cases in this module when executed as a script.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Customized optimizer to match paper results."""
import dataclasses
import tensorflow as tf
from official.modeling import optimization
from official.nlp import optimization as nlp_optimization
@dataclasses.dataclass
class DETRAdamWConfig(optimization.AdamWeightDecayConfig):
  """Config for the DETR AdamW optimizer; inherits all AdamWeightDecay fields.

  Exists as a distinct type so the custom `_DETRAdamW` optimizer can be
  selected via the optimizer oneof config below.
  """
  pass
@dataclasses.dataclass
class OptimizerConfig(optimization.OptimizerConfig):
  """Optimizer oneof config extended with the DETR-specific AdamW option."""
  # Use default_factory instead of a shared class-level instance: a bare
  # `DETRAdamWConfig()` default would be one mutable object shared by every
  # OptimizerConfig instance (the dataclass mutable-default pitfall).
  detr_adamw: DETRAdamWConfig = dataclasses.field(
      default_factory=DETRAdamWConfig)
@dataclasses.dataclass
class OptimizationConfig(optimization.OptimizationConfig):
  """Configuration for optimizer and learning rate schedule.

  Attributes:
    optimizer: optimizer oneof config.
    ema: optional exponential moving average optimizer config, if specified, ema
      optimizer will be used.
    learning_rate: learning rate oneof config.
    warmup: warmup oneof config.
  """
  # Use default_factory instead of a shared class-level instance: a bare
  # `OptimizerConfig()` default would be one mutable object shared by every
  # OptimizationConfig instance (the dataclass mutable-default pitfall).
  optimizer: OptimizerConfig = dataclasses.field(
      default_factory=OptimizerConfig)
# TODO(frederickliu): figure out how to make this configuable.
# TODO(frederickliu): Study if this is needed.
class _DETRAdamW(nlp_optimization.AdamWeightDecay):
  """Custom AdamW to support different lr scaling for backbone.

  The code is copied from AdamWeightDecay and Adam with learning scaling.
  Variables whose name does not contain 'detr' (i.e. the pretrained backbone)
  are updated with a 10x smaller learning rate for both the weight-decay step
  and the Adam step.
  """

  def _resource_apply_dense(self, grad, var, apply_state=None):
    # Dense-gradient update: decoupled weight decay first, then the raw Adam
    # (or AMSGrad) kernel.
    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
    apply_state = kwargs['apply_state']
    # Backbone variables (no 'detr' in the name) decay with 0.1x lr.
    if 'detr' not in var.name:
      lr_t *= 0.1
    decay = self._decay_weights_op(var, lr_t, apply_state)
    with tf.control_dependencies([decay]):
      var_device, var_dtype = var.device, var.dtype.base_dtype
      coefficients = ((apply_state or {}).get((var_device, var_dtype))
                      or self._fallback_apply_state(var_device, var_dtype))
      m = self.get_slot(var, 'm')
      v = self.get_slot(var, 'v')
      # Same 0.1x backbone scaling applied to the Adam step itself.
      lr = coefficients[
          'lr_t'] * 0.1 if 'detr' not in var.name else coefficients['lr_t']
      if not self.amsgrad:
        return tf.raw_ops.ResourceApplyAdam(
            var=var.handle,
            m=m.handle,
            v=v.handle,
            beta1_power=coefficients['beta_1_power'],
            beta2_power=coefficients['beta_2_power'],
            lr=lr,
            beta1=coefficients['beta_1_t'],
            beta2=coefficients['beta_2_t'],
            epsilon=coefficients['epsilon'],
            grad=grad,
            use_locking=self._use_locking)
      else:
        vhat = self.get_slot(var, 'vhat')
        return tf.raw_ops.ResourceApplyAdamWithAmsgrad(
            var=var.handle,
            m=m.handle,
            v=v.handle,
            vhat=vhat.handle,
            beta1_power=coefficients['beta_1_power'],
            beta2_power=coefficients['beta_2_power'],
            lr=lr,
            beta1=coefficients['beta_1_t'],
            beta2=coefficients['beta_2_t'],
            epsilon=coefficients['epsilon'],
            grad=grad,
            use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    # Sparse-gradient update: same decay-then-Adam structure, with the moment
    # updates written out explicitly via scatter ops.
    lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
    apply_state = kwargs['apply_state']
    # Backbone variables (no 'detr' in the name) decay with 0.1x lr.
    if 'detr' not in var.name:
      lr_t *= 0.1
    decay = self._decay_weights_op(var, lr_t, apply_state)
    with tf.control_dependencies([decay]):
      var_device, var_dtype = var.device, var.dtype.base_dtype
      coefficients = ((apply_state or {}).get((var_device, var_dtype))
                      or self._fallback_apply_state(var_device, var_dtype))
      # m_t = beta1 * m + (1 - beta1) * g_t
      m = self.get_slot(var, 'm')
      m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
      m_t = tf.compat.v1.assign(m, m * coefficients['beta_1_t'],
                                use_locking=self._use_locking)
      with tf.control_dependencies([m_t]):
        m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
      # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
      v = self.get_slot(var, 'v')
      v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
      v_t = tf.compat.v1.assign(v, v * coefficients['beta_2_t'],
                                use_locking=self._use_locking)
      with tf.control_dependencies([v_t]):
        v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
      # Same 0.1x backbone scaling applied to the Adam step itself.
      lr = coefficients[
          'lr_t'] * 0.1 if 'detr' not in var.name else coefficients['lr_t']
      if not self.amsgrad:
        v_sqrt = tf.sqrt(v_t)
        var_update = tf.compat.v1.assign_sub(
            var, lr * m_t / (v_sqrt + coefficients['epsilon']),
            use_locking=self._use_locking)
        return tf.group(*[var_update, m_t, v_t])
      else:
        v_hat = self.get_slot(var, 'vhat')
        v_hat_t = tf.maximum(v_hat, v_t)
        with tf.control_dependencies([v_hat_t]):
          v_hat_t = tf.compat.v1.assign(
              v_hat, v_hat_t, use_locking=self._use_locking)
        v_hat_sqrt = tf.sqrt(v_hat_t)
        var_update = tf.compat.v1.assign_sub(
            var,
            lr* m_t / (v_hat_sqrt + coefficients['epsilon']),
            use_locking=self._use_locking)
        return tf.group(*[var_update, m_t, v_t, v_hat_t])
optimization.register_optimizer_cls('detr_adamw', _DETRAdamW)
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DETR detection task definition."""
import tensorflow as tf
from official.core import base_task
from official.core import task_factory
from official.projects.detr.configs import detr as detr_cfg
from official.projects.detr.dataloaders import coco
from official.projects.detr.modeling import detr
from official.projects.detr.ops import matchers
from official.vision.beta.evaluation import coco_evaluator
from official.vision.beta.ops import box_ops
@task_factory.register_task_cls(detr_cfg.DetectionConfig)
class DectectionTask(base_task.Task):
"""A single-replica view of training procedure.
  DETR task provides artifacts for training/evaluation procedures, including
loading/iterating over Datasets, initializing the model, calculating the loss,
post-processing, and customized metrics with reduction.
"""
def build_model(self):
"""Build DETR model."""
model = detr.DETR(
self._task_config.num_queries,
self._task_config.num_hidden,
self._task_config.num_classes,
self._task_config.num_encoder_layers,
self._task_config.num_decoder_layers)
return model
def initialize(self, model: tf.keras.Model):
"""Loading pretrained checkpoint."""
ckpt = tf.train.Checkpoint(backbone=model.backbone)
status = ckpt.read(self._task_config.init_ckpt)
status.expect_partial().assert_existing_objects_matched()
def build_inputs(self, params, input_context=None):
"""Build input dataset."""
return coco.COCODataLoader(params).load(input_context)
  def _compute_cost(self, cls_outputs, box_outputs, cls_targets, box_targets):
    """Computes the pairwise matching cost between predictions and targets.

    Combines classification, L1 box and generalized-IoU costs weighted by the
    task-config lambdas. Padded targets (class 0) and non-finite entries are
    replaced by a large constant so the matcher never prefers them.

    Args:
      cls_outputs: class logits; assumes shape [batch, num_queries,
        num_classes] -- TODO confirm against the DETR model.
      box_outputs: predicted boxes in cycxhw format (per the conversion calls
        below), presumably [batch, num_queries, 4].
      cls_targets: integer target classes; class 0 is background/padding.
      box_targets: target boxes in cycxhw format.

    Returns:
      A [batch, num_queries, num_targets] pairwise cost tensor.
    """
    # Approximate classification cost with 1 - prob[target class].
    # The 1 is a constant that doesn't change the matching, it can be omitted.
    # background: 0
    cls_cost = self._task_config.lambda_cls * tf.gather(
        -tf.nn.softmax(cls_outputs), cls_targets, batch_dims=1, axis=-1)
    # Compute the L1 cost between boxes,
    paired_differences = self._task_config.lambda_box * tf.abs(
        tf.expand_dims(box_outputs, 2) - tf.expand_dims(box_targets, 1))
    box_cost = tf.reduce_sum(paired_differences, axis=-1)
    # Compute the giou cost between boxes
    giou_cost = self._task_config.lambda_giou * -box_ops.bbox_generalized_overlap(
        box_ops.cycxhw_to_yxyx(box_outputs),
        box_ops.cycxhw_to_yxyx(box_targets))
    total_cost = cls_cost + box_cost + giou_cost
    # Large constant used to mask out invalid pairings; the 4.0 is presumably
    # the max L1 distance for normalized cycxhw boxes -- verify.
    max_cost = (
        self._task_config.lambda_cls * 0.0 + self._task_config.lambda_box * 4. +
        self._task_config.lambda_giou * 0.0)
    # Set pads to large constant
    valid = tf.expand_dims(
        tf.cast(tf.not_equal(cls_targets, 0), dtype=total_cost.dtype), axis=1)
    total_cost = (1 - valid) * max_cost + valid * total_cost
    # Set inf of nan to large constant
    total_cost = tf.where(
        tf.logical_or(tf.math.is_nan(total_cost), tf.math.is_inf(total_cost)),
        max_cost * tf.ones_like(total_cost, dtype=total_cost.dtype),
        total_cost)
    return total_cost
  def build_losses(self, outputs, labels, aux_losses=None):
    """Build DETR losses.

    Matches predictions to targets with Hungarian matching, then computes the
    classification cross-entropy, L1 box loss and GIoU loss over the matched
    pairs. Losses are normalized by globally (all-reduced) summed weights so
    per-replica contributions add up correctly.

    Args:
      outputs: dict with 'cls_outputs' (class logits) and 'box_outputs'
        (cycxhw boxes) tensors.
      labels: dict with 'classes' (int, 0 = background/padding) and 'boxes'
        (cycxhw) tensors.
      aux_losses: optional list of auxiliary losses (e.g. model
        regularization) added to the total.

    Returns:
      A tuple (total_loss, cls_loss, box_loss, giou_loss).
    """
    cls_outputs = outputs['cls_outputs']
    box_outputs = outputs['box_outputs']
    cls_targets = labels['classes']
    box_targets = labels['boxes']
    cost = self._compute_cost(
        cls_outputs, box_outputs, cls_targets, box_targets)
    _, indices = matchers.hungarian_matching(cost)
    # The matching is a discrete decision; no gradient flows through it.
    indices = tf.stop_gradient(indices)
    # Convert the one-hot assignment matrix into per-target prediction indices
    # and gather the matched predictions.
    target_index = tf.math.argmax(indices, axis=1)
    cls_assigned = tf.gather(cls_outputs, target_index, batch_dims=1, axis=1)
    box_assigned = tf.gather(box_outputs, target_index, batch_dims=1, axis=1)
    background = tf.equal(cls_targets, 0)
    num_boxes = tf.reduce_sum(
        tf.cast(tf.logical_not(background), tf.float32), axis=-1)
    # Down-weight background to account for class imbalance.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=cls_targets, logits=cls_assigned)
    cls_loss = self._task_config.lambda_cls * tf.where(
        background,
        self._task_config.background_cls_weight * xentropy,
        xentropy
    )
    cls_weights = tf.where(
        background,
        self._task_config.background_cls_weight * tf.ones_like(cls_loss),
        tf.ones_like(cls_loss)
    )
    # Box loss is only calculated on non-background class.
    l_1 = tf.reduce_sum(tf.abs(box_assigned - box_targets), axis=-1)
    box_loss = self._task_config.lambda_box * tf.where(
        background,
        tf.zeros_like(l_1),
        l_1
    )
    # Giou loss is only calculated on non-background class.
    giou = tf.linalg.diag_part(1.0 - box_ops.bbox_generalized_overlap(
        box_ops.cycxhw_to_yxyx(box_assigned),
        box_ops.cycxhw_to_yxyx(box_targets)
    ))
    giou_loss = self._task_config.lambda_giou * tf.where(
        background,
        tf.zeros_like(giou),
        giou
    )
    # Consider doing all reduce once in train_step to speed up.
    num_boxes_per_replica = tf.reduce_sum(num_boxes)
    cls_weights_per_replica = tf.reduce_sum(cls_weights)
    replica_context = tf.distribute.get_replica_context()
    num_boxes_sum, cls_weights_sum = replica_context.all_reduce(
        tf.distribute.ReduceOp.SUM,
        [num_boxes_per_replica, cls_weights_per_replica])
    # Normalize by the global (cross-replica) sums so each replica returns the
    # same globally comparable loss value.
    cls_loss = tf.math.divide_no_nan(
        tf.reduce_sum(cls_loss), cls_weights_sum)
    box_loss = tf.math.divide_no_nan(
        tf.reduce_sum(box_loss), num_boxes_sum)
    giou_loss = tf.math.divide_no_nan(
        tf.reduce_sum(giou_loss), num_boxes_sum)
    aux_losses = tf.add_n(aux_losses) if aux_losses else 0.0
    total_loss = cls_loss + box_loss + giou_loss + aux_losses
    return total_loss, cls_loss, box_loss, giou_loss
def build_metrics(self, training=True):
  """Builds per-loss mean metrics; adds a COCO evaluator in eval mode."""
  # One running-mean metric per loss component reported by the steps.
  metrics = [
      tf.keras.metrics.Mean(loss_name, dtype=tf.float32)
      for loss_name in ('cls_loss', 'box_loss', 'giou_loss')
  ]
  if not training:
    # Detection quality (box AP) is only computed during validation.
    self.coco_metric = coco_evaluator.COCOEvaluator(
        annotation_file='',
        include_mask=False,
        need_rescale_bboxes=True,
        per_category_metrics=self._task_config.per_category_metrics)
  return metrics
def train_step(self, inputs, model, optimizer, metrics=None):
  """Does forward and backward.

  Args:
    inputs: a dictionary of input tensors.
    model: the model, forward pass definition.
    optimizer: the optimizer for this training step.
    metrics: a nested structure of metrics objects.

  Returns:
    A dictionary of logs.
  """
  features, labels = inputs
  with tf.GradientTape() as tape:
    outputs = model(features, training=True)
    # Sum the losses over every decoder layer's output; each layer is
    # supervised independently (auxiliary decoding losses).
    per_layer_losses = [
        self.build_losses(
            outputs=layer_output, labels=labels, aux_losses=model.losses)
        for layer_output in outputs
    ]
    loss = sum(layer[0] for layer in per_layer_losses)
    cls_loss = sum(layer[1] for layer in per_layer_losses)
    box_loss = sum(layer[2] for layer in per_layer_losses)
    giou_loss = sum(layer[3] for layer in per_layer_losses)

    # Consider moving scaling logic from build_losses to here.
    scaled_loss = loss
    # Under mixed precision the LossScaleOptimizer scales the loss up for
    # numerical stability; gradients are scaled back down below.
    if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
      scaled_loss = optimizer.get_scaled_loss(scaled_loss)

  tvars = model.trainable_variables
  grads = tape.gradient(scaled_loss, tvars)
  if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
    grads = optimizer.get_unscaled_gradients(grads)
  optimizer.apply_gradients(list(zip(grads, tvars)))

  # Multiply for logging.
  # Since we expect the gradient replica sum to happen in the optimizer,
  # the loss is scaled with global num_boxes and weights.
  # To have it more interpretable/comparable we scale it back when logging.
  num_replicas_in_sync = tf.distribute.get_strategy().num_replicas_in_sync
  loss *= num_replicas_in_sync
  cls_loss *= num_replicas_in_sync
  box_loss *= num_replicas_in_sync
  giou_loss *= num_replicas_in_sync

  # Trainer class handles loss metric for you.
  logs = {self.loss: loss}
  component_losses = {
      'cls_loss': cls_loss,
      'box_loss': box_loss,
      'giou_loss': giou_loss,
  }
  # Metric results will be added to logs for you.
  if metrics:
    for m in metrics:
      m.update_state(component_losses[m.name])
  return logs
def validation_step(self, inputs, model, metrics=None):
  """Validation step.

  Runs the model in inference mode, computes losses for logging, and
  packages predictions and ground truths for the COCO evaluator.

  Args:
    inputs: a dictionary of input tensors.
    model: the keras.Model.
    metrics: a nested structure of metrics objects.

  Returns:
    A dictionary of logs.
  """
  features, labels = inputs

  # Only the final decoder layer's output is evaluated; earlier layers are
  # used solely as auxiliary training losses.
  outputs = model(features, training=False)[-1]
  loss, cls_loss, box_loss, giou_loss = self.build_losses(
      outputs=outputs, labels=labels, aux_losses=model.losses)
  # Multiply for logging.
  # Since we expect the gradient replica sum to happen in the optimizer,
  # the loss is scaled with global num_boxes and weights.
  # To have it more interpretable/comparable we scale it back when logging.
  num_replicas_in_sync = tf.distribute.get_strategy().num_replicas_in_sync
  loss *= num_replicas_in_sync
  cls_loss *= num_replicas_in_sync
  box_loss *= num_replicas_in_sync
  giou_loss *= num_replicas_in_sync
  # Evaluator class handles loss metric for you.
  logs = {self.loss: loss}
  predictions = {
      # Convert boxes from (cy, cx, h, w) to (ymin, xmin, ymax, xmax) and
      # scale into absolute pixel coordinates using image_info row 1.
      # NOTE(review): assumes image_info[:, 1, :] holds the (height, width)
      # the normalized boxes refer to -- confirm against the dataloader.
      'detection_boxes':
          box_ops.cycxhw_to_yxyx(outputs['box_outputs'])
          * tf.expand_dims(
              tf.concat([
                  labels['image_info'][:, 1:2, 0],
                  labels['image_info'][:, 1:2, 1],
                  labels['image_info'][:, 1:2, 0],
                  labels['image_info'][:, 1:2, 1]
              ],
                        axis=1),
              axis=1),
      # Score is the max softmax probability over the non-background
      # classes; class index 0 is background.
      'detection_scores':
          tf.math.reduce_max(
              tf.nn.softmax(outputs['cls_outputs'])[:, :, 1:], axis=-1),
      # Slicing off class 0 shifts indices down by one; +1 restores the
      # original class id.
      'detection_classes':
          tf.math.argmax(outputs['cls_outputs'][:, :, 1:], axis=-1) + 1,
      # Fix this. It's not being used at the moment.
      'num_detections': tf.reduce_sum(
          tf.cast(
              tf.math.greater(tf.math.reduce_max(
                  outputs['cls_outputs'], axis=-1), 0), tf.int32), axis=-1),
      'source_id': labels['id'],
      'image_info': labels['image_info']
  }
  ground_truths = {
      'source_id': labels['id'],
      # NOTE(review): image_info row 0 appears to be the original image
      # size -- confirm against the dataloader.
      'height': labels['image_info'][:, 0:1, 0],
      'width': labels['image_info'][:, 0:1, 1],
      # Ground-truth boxes are padded; class 0 marks padding/background.
      'num_detections': tf.reduce_sum(
          tf.cast(tf.math.greater(labels['classes'], 0), tf.int32), axis=-1),
      'boxes': labels['gt_boxes'],
      'classes': labels['classes'],
      'is_crowds': labels['is_crowd']
  }
  logs.update({'predictions': predictions,
               'ground_truths': ground_truths})
  all_losses = {
      'cls_loss': cls_loss,
      'box_loss': box_loss,
      'giou_loss': giou_loss,
  }
  # Metric results will be added to logs for you.
  if metrics:
    for m in metrics:
      m.update_state(all_losses[m.name])
  return logs
def aggregate_logs(self, state=None, step_outputs=None):
  """Feeds one step's outputs into the COCO evaluator and returns it."""
  if state is None:
    # First step of the eval loop: start from a clean evaluator.
    self.coco_metric.reset_states()
    state = self.coco_metric
  state.update_state(step_outputs['ground_truths'],
                     step_outputs['predictions'])
  return state
def reduce_aggregated_logs(self, aggregated_logs, global_step=None):
  """Finalizes the accumulated evaluator state into metric results."""
  final_metrics = aggregated_logs.result()
  return final_metrics
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for detection."""
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from official.projects.detr import optimization
from official.projects.detr.configs import detr as detr_cfg
from official.projects.detr.dataloaders import coco
from official.projects.detr.tasks import detection
_NUM_EXAMPLES = 10
def _gen_fn():
h = np.random.randint(0, 300)
w = np.random.randint(0, 300)
num_boxes = np.random.randint(0, 50)
return {
'image': np.ones(shape=(h, w, 3), dtype=np.uint8),
'image/id': np.random.randint(0, 100),
'image/filename': 'test',
'objects': {
'is_crowd': np.ones(shape=(num_boxes), dtype=np.bool),
'bbox': np.ones(shape=(num_boxes, 4), dtype=np.float32),
'label': np.ones(shape=(num_boxes), dtype=np.int64),
'id': np.ones(shape=(num_boxes), dtype=np.int64),
'area': np.ones(shape=(num_boxes), dtype=np.int64),
}
}
def _as_dataset(self, *args, **kwargs):
  """tfds mock hook: serves _NUM_EXAMPLES randomly generated examples."""
  del args, kwargs  # Unused; the signature is fixed by tfds mock_data.

  def _example_generator():
    for _ in range(_NUM_EXAMPLES):
      yield _gen_fn()

  return tf.data.Dataset.from_generator(
      _example_generator,
      output_types=self.info.features.dtype,
      output_shapes=self.info.features.shape,
  )
class DetectionTest(tf.test.TestCase):
  """Smoke tests for the DETR detection task, run against mocked COCO data."""

  def test_train_step(self):
    # Deliberately tiny model (1 encoder / 1 decoder layer) to keep the test
    # fast; the data comes from the tfds mock above, not from real COCO.
    config = detr_cfg.DetectionConfig(
        num_encoder_layers=1,
        num_decoder_layers=1,
        train_data=coco.COCODataConfig(
            tfds_name='coco/2017',
            tfds_split='validation',
            is_training=True,
            global_batch_size=2,
        ))
    with tfds.testing.mock_data(as_dataset_fn=_as_dataset):
      # NOTE(review): `DectectionTask` (sic) mirrors the class name declared
      # in the detection module.
      task = detection.DectectionTask(config)
      model = task.build_model()
      dataset = task.build_inputs(config.train_data)
      iterator = iter(dataset)
      # AdamW with a stepwise learning-rate schedule, as used by DETR.
      opt_cfg = optimization.OptimizationConfig({
          'optimizer': {
              'type': 'detr_adamw',
              'detr_adamw': {
                  'weight_decay_rate': 1e-4,
                  'global_clipnorm': 0.1,
              }
          },
          'learning_rate': {
              'type': 'stepwise',
              'stepwise': {
                  'boundaries': [120000],
                  'values': [0.0001, 1.0e-05]
              }
          },
      })
      optimizer = detection.DectectionTask.create_optimizer(opt_cfg)
      # One optimization step: exercises forward pass, loss, and backward.
      task.train_step(next(iterator), model, optimizer)

  def test_validation_step(self):
    config = detr_cfg.DetectionConfig(
        num_encoder_layers=1,
        num_decoder_layers=1,
        validation_data=coco.COCODataConfig(
            tfds_name='coco/2017',
            tfds_split='validation',
            is_training=False,
            global_batch_size=2,
        ))
    with tfds.testing.mock_data(as_dataset_fn=_as_dataset):
      task = detection.DectectionTask(config)
      model = task.build_model()
      metrics = task.build_metrics(training=False)
      dataset = task.build_inputs(config.validation_data)
      iterator = iter(dataset)
      # Full eval path: one step, then log aggregation, then final metrics.
      logs = task.validation_step(next(iterator), model, metrics)
      state = task.aggregate_logs(step_outputs=logs)
      task.reduce_aggregated_logs(state)
# Run the test suite when this file is executed directly.
if __name__ == '__main__':
  tf.test.main()
# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TensorFlow Model Garden Vision training driver."""
from absl import app
from absl import flags
import gin
from official.common import distribute_utils
from official.common import flags as tfm_flags
from official.core import task_factory
from official.core import train_lib
from official.core import train_utils
from official.modeling import performance
# pylint: disable=unused-import
from official.projects.detr.configs import detr
from official.projects.detr.tasks import detection
# pylint: enable=unused-import
# Module-level handle to the absl flags defined in the __main__ block below.
FLAGS = flags.FLAGS
def main(_):
  """Parses configs, builds the task, and runs the training experiment."""
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  experiment_params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir

  # Only jobs that train write out the resolved config; a pure eval job
  # could otherwise race a concurrent train job writing the same yaml file.
  if 'train' in FLAGS.mode:
    train_utils.serialize_config(experiment_params, model_dir)

  # 'mixed_float16' (GPUs) or 'mixed_bfloat16' (TPUs) can significantly
  # speed up the model; loss_scale only takes effect for float16.
  mixed_precision_dtype = experiment_params.runtime.mixed_precision_dtype
  if mixed_precision_dtype:
    performance.set_mixed_precision_policy(mixed_precision_dtype)

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=experiment_params.runtime.distribution_strategy,
      all_reduce_alg=experiment_params.runtime.all_reduce_alg,
      num_gpus=experiment_params.runtime.num_gpus,
      tpu_address=experiment_params.runtime.tpu)
  with strategy.scope():
    task = task_factory.get_task(experiment_params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=strategy,
      task=task,
      mode=FLAGS.mode,
      params=experiment_params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
# Script entry point: define the standard TFM flags, require the ones this
# driver cannot run without, then hand control to absl.
if __name__ == '__main__':
  tfm_flags.define_flags()
  flags.mark_flags_as_required(['experiment', 'mode', 'model_dir'])
  app.run(main)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment