Commit 1d077fe6 authored by Vighnesh Birodkar, committed by TF Object Detection Team

Open source DeepMAC architecture.

PiperOrigin-RevId: 366382220
parent ec543191
@@ -73,6 +73,30 @@ documentation of the Object Detection API:
## What's New
### DeepMAC architecture
We have released our new architecture, **DeepMAC**, designed for partially
supervised instance segmentation. DeepMAC stands for Deep Mask-heads
Above CenterNet, and is based on our CenterNet implementation. In our
[paper](https://arxiv.org/abs/2104.00613) we show that DeepMAC achieves
state-of-the-art results for the partially supervised instance segmentation
task without using any specialty modules or losses; just better mask-head
architectures. The findings from our paper are not specific to CenterNet and
can also be applied to Mask R-CNN or without any detector at all.
Please see the links below for more details:
* [DeepMAC documentation](g3doc/deepmac.md).
* [Mask RCNN code](https://github.com/tensorflow/models/tree/master/official/vision/beta/projects/deepmac_maskrcnn)
in the TF Model Garden code base.
* [DeepMAC Colab](../colab_tutorials/deepmac_colab.ipynb) that lets you run a
pre-trained DeepMAC model on user-specified boxes. Note that you are not
restricted to COCO classes!
* Project website - [git.io/deepmac](https://git.io/deepmac)
<b>Thanks to contributors</b>: Vighnesh Birodkar, Zhichao Lu, Siyang Li,
Vivek Rathod, Jonathan Huang
### Mobile Inference for TF2 models
TF2 OD API models can now be converted to TensorFlow Lite! Only SSD models
......
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "deepmac_demo.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "P-esW81yhfCN"
},
"source": [
"# Novel class segmentation demo with Deep-MAC\n",
"\n",
"Welcome to the Novel class segmentation (with Deep-MAC) demo --- this colab loads a Deep-MAC model and tests it interactively with user-specified boxes. Deep-MAC was only trained to detect and segment COCO classes, but generalizes well when segmenting within user-specified boxes of unseen classes.\n",
"\n",
"Estimated time to run through this colab (with GPU): 10-15 minutes.\n",
"Note that the bulk of this time is in installing Tensorflow and downloading\n",
"the checkpoint then running inference for the first time. Once you've done\n",
"all that, running on new images is very fast."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Kq1eGNssiW31"
},
"source": [
"# Prerequisites\n",
"\n",
"Please change runtime to GPU."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UT7N0HJhiRKr"
},
"source": [
"# Installation and Imports\n",
"\n",
"This takes 3-4 minutes."
]
},
{
"cell_type": "code",
"metadata": {
"id": "nNdls0Pe0UPK"
},
"source": [
"!pip install -U --pre tensorflow==\"2.2.0\"\n",
"\n",
"import os\n",
"import pathlib\n",
"\n",
"# Clone the tensorflow models repository if it doesn't already exist\n",
"if \"models\" in pathlib.Path.cwd().parts:\n",
" while \"models\" in pathlib.Path.cwd().parts:\n",
" os.chdir('..')\n",
"elif not pathlib.Path('models').exists():\n",
" !git clone --depth 1 https://github.com/tensorflow/models\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "WwjV9clX0n7S"
},
"source": [
"# Install the Object Detection API\n",
"%%bash\n",
"cd models/research/\n",
"protoc object_detection/protos/*.proto --python_out=.\n",
"cp object_detection/packages/tf2/setup.py .\n",
"python -m pip install ."
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "sfrrno2L0sRR"
},
"source": [
"import glob\n",
"import io\n",
"import logging\n",
"import os\n",
"import random\n",
"import warnings\n",
"\n",
"import imageio\n",
"from IPython.display import display, Javascript\n",
"from IPython.display import Image as IPyImage\n",
"import matplotlib\n",
"from matplotlib import patches\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from object_detection.utils import colab_utils\n",
"from object_detection.utils import ops\n",
"from object_detection.utils import visualization_utils as viz_utils\n",
"from PIL import Image, ImageDraw, ImageFont\n",
"import scipy.misc\n",
"from six import BytesIO\n",
"from skimage import color\n",
"from skimage import transform\n",
"from skimage import util\n",
"from skimage.color import rgb_colors\n",
"import tensorflow as tf\n",
"\n",
"%matplotlib inline\n",
"\n",
"COLORS = ([rgb_colors.cyan, rgb_colors.orange, rgb_colors.pink,\n",
" rgb_colors.purple, rgb_colors.limegreen , rgb_colors.crimson] +\n",
" [(color) for (name, color) in color.color_dict.items()])\n",
"random.shuffle(COLORS)\n",
"\n",
"logging.disable(logging.WARNING)\n",
"\n",
"\n",
"def read_image(path):\n",
" \"\"\"Read an image and optionally resize it for better plotting.\"\"\"\n",
" with tf.io.gfile.GFile(path, 'rb') as f:\n",
" img = Image.open(f)\n",
" return np.array(img, dtype=np.uint8)\n",
"\n",
"\n",
"def resize_for_display(image, max_height=600):\n",
" height, width, _ = image.shape\n",
" width = int(width * max_height / height)\n",
" with warnings.catch_warnings():\n",
" warnings.simplefilter(\"ignore\", UserWarning)\n",
" return util.img_as_ubyte(transform.resize(image, (height, width)))\n",
"\n",
"\n",
"def get_mask_prediction_function(model):\n",
" \"\"\"Get single image mask prediction function using a model.\"\"\"\n",
"\n",
" @tf.function\n",
" def predict_masks(image, boxes):\n",
" height, width, _ = image.shape.as_list()\n",
" batch = image[tf.newaxis]\n",
" boxes = boxes[tf.newaxis]\n",
"\n",
" detections = model(batch, boxes)\n",
" masks = detections['detection_masks']\n",
"\n",
" return ops.reframe_box_masks_to_image_masks(masks[0], boxes[0],\n",
" height, width)\n",
"\n",
" return predict_masks\n",
"\n",
"\n",
"def plot_image_annotations(image, boxes, masks, darken_image=0.5):\n",
" fig, ax = plt.subplots(figsize=(16, 12))\n",
" ax.set_axis_off()\n",
" image = (image * darken_image).astype(np.uint8)\n",
" ax.imshow(image)\n",
"\n",
" height, width, _ = image.shape\n",
"\n",
" num_colors = len(COLORS)\n",
" color_index = 0\n",
"\n",
" for box, mask in zip(boxes, masks):\n",
" ymin, xmin, ymax, xmax = box\n",
" ymin *= height\n",
" ymax *= height\n",
" xmin *= width\n",
" xmax *= width\n",
"\n",
" color = COLORS[color_index]\n",
" color = np.array(color)\n",
" rect = patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,\n",
" linewidth=2.5, edgecolor=color, facecolor='none')\n",
" ax.add_patch(rect)\n",
" mask = (mask > 0.5).astype(np.float32)\n",
" color_image = np.ones_like(image) * color[np.newaxis, np.newaxis, :]\n",
" color_and_mask = np.concatenate(\n",
" [color_image, mask[:, :, np.newaxis]], axis=2)\n",
"\n",
" ax.imshow(color_and_mask, alpha=0.5)\n",
"\n",
" color_index = (color_index + 1) % num_colors\n",
"\n",
" return ax"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ry9yq8zsi0Gg"
},
"source": [
"# Load Deep-MAC Model\n",
"\n",
"This can take up to 5 minutes."
]
},
{
"cell_type": "code",
"metadata": {
"id": "PZ-wnbYu05K8"
},
"source": [
"print('Downloading and untarring model')\n",
"!wget http://download.tensorflow.org/models/object_detection/tf2/20210329/deepmac_1024x1024_coco17.tar.gz\n",
"!cp deepmac_1024x1024_coco17.tar.gz models/research/object_detection/test_data/\n",
"!tar -xzf models/research/object_detection/test_data/deepmac_1024x1024_coco17.tar.gz\n",
"!mv deepmac_1024x1024_coco17 models/research/object_detection/test_data/\n",
"model_path = 'models/research/object_detection/test_data/deepmac_1024x1024_coco17/saved_model'\n",
"\n",
"print('Loading SavedModel')\n",
"model = tf.keras.models.load_model(model_path)\n",
"prediction_function = get_mask_prediction_function(model)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ilXkYOB_NUSc"
},
"source": [
"# Load image"
]
},
{
"cell_type": "code",
"metadata": {
"id": "txj4UkoDNaOq"
},
"source": [
"image_path = 'models/research/object_detection/test_images/image3.jpg'\n",
"image = read_image(image_path)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "zyhudgYUjcvE"
},
"source": [
"# Annotate an image with one or more boxes\n",
"\n",
"This model is trained on COCO categories, but we encourage you to try segmenting\n",
"anything you want!\n",
"\n",
"Don't forget to hit **submit** when done."
]
},
{
"cell_type": "code",
"metadata": {
"id": "aZvY4At0074j"
},
"source": [
"display_image = resize_for_display(image)\n",
"\n",
"boxes_list = []\n",
"colab_utils.annotate([display_image], boxes_list)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "gUUG7NPBJMoa"
},
"source": [
"# In case you didn't want to label...\n",
"\n",
"Run this cell only if you didn't annotate anything above and would prefer to just use our preannotated boxes. Don't forget to uncomment.\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "lupqTv1HJK5K"
},
"source": [
"# boxes_list = [np.array([[0.000, 0.160, 0.362, 0.812],\n",
"# [0.340, 0.286, 0.472, 0.619],\n",
"# [0.437, 0.008, 0.650, 0.263],\n",
"# [0.382, 0.003, 0.538, 0.594],\n",
"# [0.518, 0.444, 0.625,0.554]], dtype=np.float32)]"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ak1WO93NjvN-"
},
"source": [
"# Visualize mask predictions"
]
},
{
"cell_type": "code",
"metadata": {
"id": "vdzuKnpj1A3L"
},
"source": [
"%matplotlib inline\n",
"\n",
"boxes = boxes_list[0]\n",
"masks = prediction_function(tf.convert_to_tensor(image),\n",
" tf.convert_to_tensor(boxes, dtype=tf.float32))\n",
"plot_image_annotations(image, boxes, masks.numpy())\n",
"plt.show()"
],
"execution_count": null,
"outputs": []
}
]
}
\ No newline at end of file
# DeepMAC meta architecture from the paper "The surprising impact of mask-head
# architecture on novel class segmentation" [1], with an Hourglass-100 [2]
# mask head. This config is trained on all COCO classes and achieves a
# mask mAP of 39.4% on the COCO test-dev2017 set.
# [1]: https://arxiv.org/abs/2104.00613
# [2]: https://arxiv.org/abs/1904.07850
# Train on TPU-128
model {
center_net {
num_classes: 90
feature_extractor {
type: "hourglass_104"
bgr_ordering: true
channel_means: [104.01362025, 114.03422265, 119.9165958 ]
channel_stds: [73.6027665 , 69.89082075, 70.9150767 ]
}
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 1024
max_dimension: 1024
pad_to_max_dimension: true
}
}
object_detection_task {
task_loss_weight: 1.0
offset_loss_weight: 1.0
scale_loss_weight: 0.1
localization_loss {
l1_localization_loss {
}
}
}
object_center_params {
object_center_loss_weight: 1.0
min_box_overlap_iou: 0.7
max_box_predictions: 100
classification_loss {
penalty_reduced_logistic_focal_loss {
alpha: 2.0
beta: 4.0
}
}
}
deepmac_mask_estimation {
dim: 32
task_loss_weight: 5.0
pixel_embedding_dim: 16
mask_size: 32
use_xy: true
use_instance_embedding: true
network_type: "hourglass100"
classification_loss {
weighted_sigmoid {}
}
}
}
}
train_config: {
batch_size: 128
num_steps: 50000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_adjust_hue {
}
}
data_augmentation_options {
random_adjust_contrast {
}
}
data_augmentation_options {
random_adjust_saturation {
}
}
data_augmentation_options {
random_adjust_brightness {
}
}
data_augmentation_options {
random_square_crop_by_scale {
scale_min: 0.6
scale_max: 1.3
}
}
optimizer {
adam_optimizer: {
epsilon: 1e-7 # Match tf.keras.optimizers.Adam's default.
learning_rate: {
cosine_decay_learning_rate {
learning_rate_base: 1e-3
total_steps: 50000
warmup_learning_rate: 2.5e-4
warmup_steps: 5000
}
}
}
use_moving_average: false
}
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
fine_tune_checkpoint_version: V2
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/ckpt-51"
fine_tune_checkpoint_type: "fine_tune"
}
train_input_reader: {
load_instance_masks: true
label_map_path: "PATH_TO_BE_CONFIGURED/label_map.txt"
mask_type: PNG_MASKS
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/train2017-?????-of-00256.tfrecord"
}
}
eval_config: {
metrics_set: "coco_detection_metrics"
metrics_set: "coco_mask_metrics"
include_metrics_per_category: true
use_moving_averages: false
batch_size: 1;
super_categories {
key: "VOC"
value: "person,bicycle,car,motorcycle,airplane,bus,train,boat,bird,cat,"
"dog,horse,sheep,cow,bottle,chair,couch,potted plant,dining table,tv"
}
super_categories {
key: "NonVOC"
value: "truck,traffic light,fire hydrant,stop sign,parking meter,bench,"
"elephant,bear,zebra,giraffe,backpack,umbrella,handbag,tie,suitcase,"
"frisbee,skis,snowboard,sports ball,kite,baseball bat,baseball glove,"
"skateboard,surfboard,tennis racket,wine glass,cup,fork,knife,spoon,bowl,"
"banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,cake,bed,"
"toilet,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,"
"sink,refrigerator,book,clock,vase,scissors,teddy bear,hair drier,"
"toothbrush"
}
super_categories {
key: "person"
value: "person"
}
super_categories {
key: "vehicle"
value: "bicycle,car,motorcycle,airplane,bus,train,truck,boat"
}
super_categories {
key: "outdoor"
value: "traffic light,fire hydrant,stop sign,parking meter,bench"
}
super_categories {
key: "animal"
value: "bird,cat,dog,horse,sheep,cow,elephant,bear,zebra,giraffe"
}
super_categories {
key: "accessory"
value: "backpack,umbrella,handbag,tie,suitcase"
}
super_categories {
key: "sports"
value: "frisbee,skis,snowboard,sports ball,kite,baseball bat,"
"baseball glove,skateboard,surfboard,tennis racket"
}
super_categories {
key: "kitchen"
value: "bottle,wine glass,cup,fork,knife,spoon,bowl"
}
super_categories {
key: "food"
value: "banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,"
"cake"
}
super_categories {
key: "furniture"
value: "chair,couch,potted plant,bed,dining table,toilet"
}
super_categories {
key: "electronic"
value: "tv,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,"
"sink,refrigerator"
}
super_categories {
key: "indoor"
value: "book,clock,vase,scissors,teddy bear,hair drier,toothbrush"
}
}
eval_input_reader: {
load_instance_masks: true
mask_type: PNG_MASKS
label_map_path: "PATH_TO_BE_CONFIGURED/label_map.txt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/val2017-?????-of-00032.tfrecord"
}
}
# DeepMAC meta architecture from the paper "The surprising impact of mask-head
# architecture on novel class segmentation" [1], with an Hourglass-100 [2]
# mask head. This config is trained on masks from the non-VOC classes and
# achieves a mask mAP of 39.1% on the VOC classes.
# [1]: https://arxiv.org/abs/2104.00613
# [2]: https://arxiv.org/abs/1904.07850
# Train on TPU-128
model {
center_net {
num_classes: 90
feature_extractor {
type: "hourglass_104"
bgr_ordering: true
channel_means: [104.01362025, 114.03422265, 119.9165958 ]
channel_stds: [73.6027665 , 69.89082075, 70.9150767 ]
}
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 1024
max_dimension: 1024
pad_to_max_dimension: true
}
}
object_detection_task {
task_loss_weight: 1.0
offset_loss_weight: 1.0
scale_loss_weight: 0.1
localization_loss {
l1_localization_loss {
}
}
}
object_center_params {
object_center_loss_weight: 1.0
min_box_overlap_iou: 0.7
max_box_predictions: 100
classification_loss {
penalty_reduced_logistic_focal_loss {
alpha: 2.0
beta: 4.0
}
}
}
deepmac_mask_estimation {
dim: 32
task_loss_weight: 5.0
pixel_embedding_dim: 16
mask_size: 32
use_xy: true
use_instance_embedding: true
network_type: "hourglass100"
classification_loss {
weighted_sigmoid {}
}
allowed_masked_classes_ids: [
8,
10,
11,
13,
14,
15,
22,
23,
24,
25,
27,
28,
31,
32,
33,
34,
35,
36,
37,
38,
39,
40,
41,
42,
43,
46,
47,
48,
49,
50,
51,
52,
53,
54,
55,
56,
57,
58,
59,
60,
61,
65,
70,
73,
74,
75,
76,
77,
78,
79,
80,
81,
82,
84,
85,
86,
87,
88,
89,
90
]
}
}
}
train_config: {
batch_size: 128
num_steps: 50000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_adjust_hue {
}
}
data_augmentation_options {
random_adjust_contrast {
}
}
data_augmentation_options {
random_adjust_saturation {
}
}
data_augmentation_options {
random_adjust_brightness {
}
}
data_augmentation_options {
random_square_crop_by_scale {
scale_min: 0.6
scale_max: 1.3
}
}
optimizer {
adam_optimizer: {
epsilon: 1e-7 # Match tf.keras.optimizers.Adam's default.
learning_rate: {
cosine_decay_learning_rate {
learning_rate_base: 1e-3
total_steps: 50000
warmup_learning_rate: 2.5e-4
warmup_steps: 5000
}
}
}
use_moving_average: false
}
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
fine_tune_checkpoint_version: V2
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/ckpt-51"
fine_tune_checkpoint_type: "fine_tune"
}
train_input_reader: {
load_instance_masks: true
label_map_path: "PATH_TO_BE_CONFIGURED/label_map.txt"
mask_type: PNG_MASKS
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/train2017-?????-of-00256.tfrecord"
}
}
eval_config: {
metrics_set: "coco_detection_metrics"
metrics_set: "coco_mask_metrics"
include_metrics_per_category: true
use_moving_averages: false
batch_size: 1;
super_categories {
key: "VOC"
value: "person,bicycle,car,motorcycle,airplane,bus,train,boat,bird,cat,"
"dog,horse,sheep,cow,bottle,chair,couch,potted plant,dining table,tv"
}
super_categories {
key: "NonVOC"
value: "truck,traffic light,fire hydrant,stop sign,parking meter,bench,"
"elephant,bear,zebra,giraffe,backpack,umbrella,handbag,tie,suitcase,"
"frisbee,skis,snowboard,sports ball,kite,baseball bat,baseball glove,"
"skateboard,surfboard,tennis racket,wine glass,cup,fork,knife,spoon,bowl,"
"banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,cake,bed,"
"toilet,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,"
"sink,refrigerator,book,clock,vase,scissors,teddy bear,hair drier,"
"toothbrush"
}
super_categories {
key: "person"
value: "person"
}
super_categories {
key: "vehicle"
value: "bicycle,car,motorcycle,airplane,bus,train,truck,boat"
}
super_categories {
key: "outdoor"
value: "traffic light,fire hydrant,stop sign,parking meter,bench"
}
super_categories {
key: "animal"
value: "bird,cat,dog,horse,sheep,cow,elephant,bear,zebra,giraffe"
}
super_categories {
key: "accessory"
value: "backpack,umbrella,handbag,tie,suitcase"
}
super_categories {
key: "sports"
value: "frisbee,skis,snowboard,sports ball,kite,baseball bat,"
"baseball glove,skateboard,surfboard,tennis racket"
}
super_categories {
key: "kitchen"
value: "bottle,wine glass,cup,fork,knife,spoon,bowl"
}
super_categories {
key: "food"
value: "banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,"
"cake"
}
super_categories {
key: "furniture"
value: "chair,couch,potted plant,bed,dining table,toilet"
}
super_categories {
key: "electronic"
value: "tv,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,"
"sink,refrigerator"
}
super_categories {
key: "indoor"
value: "book,clock,vase,scissors,teddy bear,hair drier,toothbrush"
}
}
eval_input_reader: {
load_instance_masks: true
mask_type: PNG_MASKS
label_map_path: "PATH_TO_BE_CONFIGURED/label_map.txt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/val2017-?????-of-00032.tfrecord"
}
}
# DeepMAC meta architecture from the paper "The surprising impact of mask-head
# architecture on novel class segmentation" [1], with an Hourglass-100 [2]
# mask head. This config is only trained on masks from the VOC classes in COCO
# and achieves a mask mAP of 35.5% on non-VOC classes.
# [1]: https://arxiv.org/abs/2104.00613
# [2]: https://arxiv.org/abs/1904.07850
# Train on TPU-128
model {
center_net {
num_classes: 90
feature_extractor {
type: "hourglass_104"
bgr_ordering: true
channel_means: [104.01362025, 114.03422265, 119.9165958 ]
channel_stds: [73.6027665 , 69.89082075, 70.9150767 ]
}
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 1024
max_dimension: 1024
pad_to_max_dimension: true
}
}
object_detection_task {
task_loss_weight: 1.0
offset_loss_weight: 1.0
scale_loss_weight: 0.1
localization_loss {
l1_localization_loss {
}
}
}
object_center_params {
object_center_loss_weight: 1.0
min_box_overlap_iou: 0.7
max_box_predictions: 100
classification_loss {
penalty_reduced_logistic_focal_loss {
alpha: 2.0
beta: 4.0
}
}
}
deepmac_mask_estimation {
dim: 32
task_loss_weight: 5.0
pixel_embedding_dim: 16
mask_size: 32
use_xy: true
use_instance_embedding: true
network_type: "hourglass100"
classification_loss {
weighted_sigmoid {}
}
allowed_masked_classes_ids: [
1, # person
2, # bicycle
3, # car
4, # motorcycle/motorbike
5, # airplane/aeroplane
6, # bus
7, # train
9, # boat
16, # bird
17, # cat
18, # dog
19, # horse
20, # sheep
21, # cow
44, # bottle
62, # chair
63, # couch/sofa
64, # potted plant
67, # dining table
72 # tvmonitor
]
}
}
}
train_config: {
batch_size: 128
num_steps: 50000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_adjust_hue {
}
}
data_augmentation_options {
random_adjust_contrast {
}
}
data_augmentation_options {
random_adjust_saturation {
}
}
data_augmentation_options {
random_adjust_brightness {
}
}
data_augmentation_options {
random_square_crop_by_scale {
scale_min: 0.6
scale_max: 1.3
}
}
optimizer {
adam_optimizer: {
epsilon: 1e-7 # Match tf.keras.optimizers.Adam's default.
learning_rate: {
cosine_decay_learning_rate {
learning_rate_base: 1e-3
total_steps: 50000
warmup_learning_rate: 2.5e-4
warmup_steps: 5000
}
}
}
use_moving_average: false
}
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
fine_tune_checkpoint_version: V2
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/ckpt-51"
fine_tune_checkpoint_type: "fine_tune"
}
train_input_reader: {
load_instance_masks: true
label_map_path: "PATH_TO_BE_CONFIGURED/label_map.txt"
mask_type: PNG_MASKS
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/train2017-?????-of-00256.tfrecord"
}
}
eval_config: {
metrics_set: "coco_detection_metrics"
metrics_set: "coco_mask_metrics"
include_metrics_per_category: true
use_moving_averages: false
batch_size: 1;
super_categories {
key: "VOC"
value: "person,bicycle,car,motorcycle,airplane,bus,train,boat,bird,cat,"
"dog,horse,sheep,cow,bottle,chair,couch,potted plant,dining table,tv"
}
super_categories {
key: "NonVOC"
value: "truck,traffic light,fire hydrant,stop sign,parking meter,bench,"
"elephant,bear,zebra,giraffe,backpack,umbrella,handbag,tie,suitcase,"
"frisbee,skis,snowboard,sports ball,kite,baseball bat,baseball glove,"
"skateboard,surfboard,tennis racket,wine glass,cup,fork,knife,spoon,bowl,"
"banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,cake,bed,"
"toilet,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,"
"sink,refrigerator,book,clock,vase,scissors,teddy bear,hair drier,"
"toothbrush"
}
super_categories {
key: "person"
value: "person"
}
super_categories {
key: "vehicle"
value: "bicycle,car,motorcycle,airplane,bus,train,truck,boat"
}
super_categories {
key: "outdoor"
value: "traffic light,fire hydrant,stop sign,parking meter,bench"
}
super_categories {
key: "animal"
value: "bird,cat,dog,horse,sheep,cow,elephant,bear,zebra,giraffe"
}
super_categories {
key: "accessory"
value: "backpack,umbrella,handbag,tie,suitcase"
}
super_categories {
key: "sports"
value: "frisbee,skis,snowboard,sports ball,kite,baseball bat,"
"baseball glove,skateboard,surfboard,tennis racket"
}
super_categories {
key: "kitchen"
value: "bottle,wine glass,cup,fork,knife,spoon,bowl"
}
super_categories {
key: "food"
value: "banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,"
"cake"
}
super_categories {
key: "furniture"
value: "chair,couch,potted plant,bed,dining table,toilet"
}
super_categories {
key: "electronic"
value: "tv,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,"
"sink,refrigerator"
}
super_categories {
key: "indoor"
value: "book,clock,vase,scissors,teddy bear,hair drier,toothbrush"
}
}
eval_input_reader: {
load_instance_masks: true
mask_type: PNG_MASKS
label_map_path: "PATH_TO_BE_CONFIGURED/label_map.txt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/val2017-?????-of-00032.tfrecord"
}
}
# DeepMAC meta architecture from the paper "The surprising impact of mask-head
# architecture on novel class segmentation" [1], with an Hourglass-52 [2]
# mask head. This config is only trained on masks from the VOC classes in COCO
# and achieves a mask mAP of 32.5% on non-VOC classes.
# [1]: https://arxiv.org/abs/2104.00613
# [2]: https://arxiv.org/abs/1904.07850
# Train on TPU-32
model {
center_net {
num_classes: 90
feature_extractor {
type: "hourglass_104"
bgr_ordering: true
channel_means: [104.01362025, 114.03422265, 119.9165958 ]
channel_stds: [73.6027665 , 69.89082075, 70.9150767 ]
}
image_resizer {
keep_aspect_ratio_resizer {
min_dimension: 512
max_dimension: 512
pad_to_max_dimension: true
}
}
object_detection_task {
task_loss_weight: 1.0
offset_loss_weight: 1.0
scale_loss_weight: 0.1
localization_loss {
l1_localization_loss {
}
}
}
object_center_params {
object_center_loss_weight: 1.0
min_box_overlap_iou: 0.7
max_box_predictions: 100
classification_loss {
penalty_reduced_logistic_focal_loss {
alpha: 2.0
beta: 4.0
}
}
}
deepmac_mask_estimation {
dim: 32
task_loss_weight: 5.0
pixel_embedding_dim: 16
mask_size: 32
use_xy: true
use_instance_embedding: true
network_type: "hourglass52"
classification_loss {
weighted_sigmoid {}
}
allowed_masked_classes_ids: [
1, # person
2, # bicycle
3, # car
4, # motorcycle/motorbike
5, # airplane/aeroplane
6, # bus
7, # train
9, # boat
16, # bird
17, # cat
18, # dog
19, # horse
20, # sheep
21, # cow
44, # bottle
62, # chair
63, # couch/sofa
64, # potted plant
67, # dining table
72 # tvmonitor
]
}
}
}
train_config: {
batch_size: 128
num_steps: 50000
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_adjust_hue {
}
}
data_augmentation_options {
random_adjust_contrast {
}
}
data_augmentation_options {
random_adjust_saturation {
}
}
data_augmentation_options {
random_adjust_brightness {
}
}
data_augmentation_options {
random_square_crop_by_scale {
scale_min: 0.6
scale_max: 1.3
}
}
optimizer {
adam_optimizer: {
epsilon: 1e-7 # Match tf.keras.optimizers.Adam's default.
learning_rate: {
cosine_decay_learning_rate {
learning_rate_base: 1e-3
total_steps: 50000
warmup_learning_rate: 2.5e-4
warmup_steps: 5000
}
}
}
use_moving_average: false
}
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
fine_tune_checkpoint_version: V2
fine_tune_checkpoint: "PATH_TO_BE_CONFIGURED/ckpt-1"
fine_tune_checkpoint_type: "detection"
}
train_input_reader: {
load_instance_masks: true
label_map_path: "PATH_TO_BE_CONFIGURED/label_map.txt"
mask_type: PNG_MASKS
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/train2017-?????-of-00256.tfrecord"
}
}
eval_config: {
metrics_set: "coco_detection_metrics"
metrics_set: "coco_mask_metrics"
include_metrics_per_category: true
use_moving_averages: false
batch_size: 1;
super_categories {
key: "VOC"
value: "person,bicycle,car,motorcycle,airplane,bus,train,boat,bird,cat,"
"dog,horse,sheep,cow,bottle,chair,couch,potted plant,dining table,tv"
}
super_categories {
key: "NonVOC"
value: "truck,traffic light,fire hydrant,stop sign,parking meter,bench,"
"elephant,bear,zebra,giraffe,backpack,umbrella,handbag,tie,suitcase,"
"frisbee,skis,snowboard,sports ball,kite,baseball bat,baseball glove,"
"skateboard,surfboard,tennis racket,wine glass,cup,fork,knife,spoon,bowl,"
"banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,cake,bed,"
"toilet,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,"
"sink,refrigerator,book,clock,vase,scissors,teddy bear,hair drier,"
"toothbrush"
}
super_categories {
key: "person"
value: "person"
}
super_categories {
key: "vehicle"
value: "bicycle,car,motorcycle,airplane,bus,train,truck,boat"
}
super_categories {
key: "outdoor"
value: "traffic light,fire hydrant,stop sign,parking meter,bench"
}
super_categories {
key: "animal"
value: "bird,cat,dog,horse,sheep,cow,elephant,bear,zebra,giraffe"
}
super_categories {
key: "accessory"
value: "backpack,umbrella,handbag,tie,suitcase"
}
super_categories {
key: "sports"
value: "frisbee,skis,snowboard,sports ball,kite,baseball bat,"
"baseball glove,skateboard,surfboard,tennis racket"
}
super_categories {
key: "kitchen"
value: "bottle,wine glass,cup,fork,knife,spoon,bowl"
}
super_categories {
key: "food"
value: "banana,apple,sandwich,orange,broccoli,carrot,hot dog,pizza,donut,"
"cake"
}
super_categories {
key: "furniture"
value: "chair,couch,potted plant,bed,dining table,toilet"
}
super_categories {
key: "electronic"
value: "tv,laptop,mouse,remote,keyboard,cell phone,microwave,oven,toaster,"
"sink,refrigerator"
}
super_categories {
key: "indoor"
value: "book,clock,vase,scissors,teddy bear,hair drier,toothbrush"
}
}
eval_input_reader: {
load_instance_masks: true
mask_type: PNG_MASKS
label_map_path: "PATH_TO_BE_CONFIGURED/label_map.txt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "PATH_TO_BE_CONFIGURED/val2017-?????-of-00032.tfrecord"
}
}
# DeepMAC model
<!-- TODO(vighneshb) add correct arxiv links and test this page.-->
**DeepMAC** (Deep Mask heads Above CenterNet) is a neural network architecture
that is designed for the partially supervised instance segmentation task. For
details, see the paper
[The surprising impact of mask-head architecture on novel class segmentation](https://arxiv.org/abs/2104.00613).
The figure below shows improved mask predictions for unseen classes as we
use better mask-head architectures.
<p align="center">
<img src="./img/mask_improvement.png" style="width:50%;"/>
</p>
Just by using better mask-head architectures (no extra losses or modules), we
achieve state-of-the-art performance in the partially supervised instance
segmentation task.
## Code structure
* `deepmac_meta_arch.py` implements our main architecture, DeepMAC, on top of
the CenterNet detection architecture.
* The proto message `DeepMACMaskEstimation` in `center_net.proto` controls the
configuration of the mask head used.
* The field `allowed_masked_classes_ids` controls which classes receive mask
supervision during training (see the sketch after this list).
* Mask R-CNN based ablations in the paper are implemented in the
[TF Model Garden](https://github.com/tensorflow/models/tree/master/official/vision/beta/projects/deepmac_maskrcnn)
code base.
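For illustration, here is a minimal sketch of the semantics of
`allowed_masked_classes_ids`, using the `filter_masked_classes` utility from
`deepmac_meta_arch.py`. It assumes the Object Detection API is installed,
eager execution, and that the module lives under
`object_detection.meta_architectures` as in this commit:

```python
import tensorflow as tf
from object_detection.meta_architectures import deepmac_meta_arch

# Two instances: class id 1 (person) and class id 16 (bird). The ids in
# `allowed_masked_classes_ids` are 1-indexed; one-hot vectors are 0-indexed.
classes = tf.one_hot([0, 15], depth=90)
weights = tf.ones([2])
masks = tf.ones([2, 32, 32])

# Allow mask supervision only for class id 1 (person). The bird instance's
# weight and mask are zeroed out, so it contributes no mask loss.
_, weights, masks = deepmac_meta_arch.filter_masked_classes(
    [1], classes, weights, masks)
print(weights.numpy())  # [1. 0.]
```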
## Prerequisites
1. Follow the [TF2 install instructions](tf2.md) to install the Object Detection API.
2. Generate the COCO dataset using
[create_coco_tf_record.py](../../official/vision/beta/data/create_coco_tf_record.py).
## Configurations
We provide pre-defined configs which can be run as a
[TF2 training pipeline](tf2_training_and_evaluation.md). Each of these
configurations needs to be passed as the `pipeline_config_path` argument to the
`object_detection/model_main_tf2.py` binary. Note that the `512x512` resolution
models require a TPU `v3-32` and the `1024x1024` resolution models require a TPU
`v3-128` to train. The configs can be found in the [configs/tf2](../configs/tf2)
directory. In the table below `X->Y` indicates that we train with masks from `X`
and evaluate with masks from `Y`. Performance is measured on the `coco-val2017`
set.
### Partially supervised models
Resolution | Mask head | Train->Eval | Config name | Mask mAP
:--------- | :------------ | :------------- | :------------------------------------------------- | -------:
512x512 | Hourglass-52 | VOC -> Non-VOC | `center_net_deepmac_512x512_voc_only.config` | 32.5
1024x1024 | Hourglass-100 | VOC -> Non-VOC | `center_net_deepmac_1024x1024_voc_only.config` | 35.5
1024x1024 | Hourglass-100 | Non-VOC -> VOC | `center_net_deepmac_1024x1024_non_voc_only.config` | 39.1
### Fully supervised models
Here we report the Mask mAP averaged over all COCO classes on the `test-dev2017`
set.
Resolution | Mask head | Config name | Mask mAP
:--------- | :------------ | :----------------------------------------- | -------:
1024x1024 | Hourglass-100 | `center_net_deepmac_1024x1024_coco.config` | 39.4
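As a sanity check before launching a job, a config can be loaded and inspected
with the API's `config_util`. A minimal sketch, assuming the config path below
is a placeholder for wherever the configs live in your checkout:

```python
from object_detection.utils import config_util

# Placeholder path: point this at the config in your checkout.
configs = config_util.get_configs_from_pipeline_file(
    'configs/tf2/center_net_deepmac_1024x1024_voc_only.config')

deepmac_config = configs['model'].center_net.deepmac_mask_estimation
print(deepmac_config.network_type)                      # hourglass100
print(list(deepmac_config.allowed_masked_classes_ids))  # the 20 VOC class ids
```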
## Demos
* [DeepMAC Colab](../colab_tutorials/deepmac_colab.ipynb) lets you run a
pre-trained DeepMAC model on user-specified boxes. Note that you are not
restricted to COCO classes!
## Pre-trained models
* [COCO Checkpoint](http://download.tensorflow.org/models/object_detection/tf2/20210329/deepmac_1024x1024_coco17.tar.gz) -
Takes an image and a set of boxes as input and produces per-box instance masks as output.
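A minimal inference sketch, distilled from the demo colab in this commit. It
assumes the tarball above has been downloaded and extracted into the working
directory; the zero image is a placeholder for a real photo:

```python
import tensorflow as tf
from object_detection.utils import ops

model = tf.keras.models.load_model('deepmac_1024x1024_coco17/saved_model')

image = tf.zeros([640, 480, 3], dtype=tf.uint8)  # placeholder; use a real image
# One box per row, in normalized [ymin, xmin, ymax, xmax] coordinates.
boxes = tf.constant([[0.1, 0.1, 0.9, 0.9]], dtype=tf.float32)

detections = model(image[tf.newaxis], boxes[tf.newaxis])
# Reframe the per-box mask crops back onto the full image canvas.
masks = ops.reframe_box_masks_to_image_masks(
    detections['detection_masks'][0], boxes, 640, 480)
print(masks.shape)  # (1, 640, 480) float mask probabilities, one per box
```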
## See also
* [Mask RCNN code](https://github.com/tensorflow/models/tree/master/official/vision/beta/projects/deepmac_maskrcnn)
in the TF Model Garden code base.
* Project website - [git.io/deepmac](https://git.io/deepmac)
## Citation
```
@misc{birodkar2021surprising,
title={The surprising impact of mask-head architecture on novel class segmentation},
author={Vighnesh Birodkar and Zhichao Lu and Siyang Li and Vivek Rathod and Jonathan Huang},
year={2021},
eprint={2104.00613},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
"""Deep Mask heads above CenterNet (DeepMAC) architecture.
TODO(vighneshb) Add link to paper when done.
"""
import collections
import numpy as np
import tensorflow as tf
from object_detection.builders import losses_builder
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import losses
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import center_net_meta_arch
from object_detection.models.keras_models import hourglass_network
from object_detection.models.keras_models import resnet_v1
from object_detection.protos import losses_pb2
from object_detection.utils import shape_utils
from object_detection.utils import spatial_transform_ops
INSTANCE_EMBEDDING = 'INSTANCE_EMBEDDING'
PIXEL_EMBEDDING = 'PIXEL_EMBEDDING'
DEEP_MASK_ESTIMATION = 'deep_mask_estimation'
LOSS_KEY_PREFIX = center_net_meta_arch.LOSS_KEY_PREFIX
class DeepMACParams(
collections.namedtuple('DeepMACParams', [
'classification_loss', 'dim', 'task_loss_weight', 'pixel_embedding_dim',
'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
'predict_full_resolution_masks', 'postprocess_crop_size'
])):
"""Class holding the DeepMAC network configutration."""
__slots__ = ()
def __new__(cls, classification_loss, dim, task_loss_weight,
pixel_embedding_dim, allowed_masked_classes_ids, mask_size,
mask_num_subsamples, use_xy, network_type, use_instance_embedding,
num_init_channels, predict_full_resolution_masks,
postprocess_crop_size):
return super(DeepMACParams,
cls).__new__(cls, classification_loss, dim,
task_loss_weight, pixel_embedding_dim,
allowed_masked_classes_ids, mask_size,
mask_num_subsamples, use_xy, network_type,
use_instance_embedding, num_init_channels,
predict_full_resolution_masks,
postprocess_crop_size)
def subsample_instances(classes, weights, boxes, masks, num_subsamples):
"""Randomly subsamples instances to the desired number.
Args:
classes: [num_instances, num_classes] float tensor of one-hot encoded
classes.
weights: [num_instances] float tensor of weights of each instance.
boxes: [num_instances, 4] tensor of box coordinates.
masks: [num_instances, height, width] tensor of per-instance masks.
num_subsamples: int, the desired number of samples.
Returns:
classes: [num_subsamples, num_classes] float tensor of classes.
weights: [num_subsamples] float tensor of weights.
boxes: [num_subsamples, 4] float tensor of box coordinates.
masks: [num_subsamples, height, width] float tensor of per-instance masks.
"""
if num_subsamples <= -1:
return classes, weights, boxes, masks
num_instances = tf.reduce_sum(tf.cast(weights > 0.5, tf.int32))
if num_instances <= num_subsamples:
return (classes[:num_subsamples], weights[:num_subsamples],
boxes[:num_subsamples], masks[:num_subsamples])
else:
random_index = tf.random.uniform([num_subsamples], 0, num_instances,
dtype=tf.int32)
return (tf.gather(classes, random_index), tf.gather(weights, random_index),
tf.gather(boxes, random_index), tf.gather(masks, random_index))
def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None):
"""Get DeepMAC network model given a string type."""
if name.startswith('hourglass'):
if name == 'hourglass10':
return hourglass_network.hourglass_10(num_init_channels,
initial_downsample=False)
elif name == 'hourglass20':
return hourglass_network.hourglass_20(num_init_channels,
initial_downsample=False)
elif name == 'hourglass32':
return hourglass_network.hourglass_32(num_init_channels,
initial_downsample=False)
elif name == 'hourglass52':
return hourglass_network.hourglass_52(num_init_channels,
initial_downsample=False)
elif name == 'hourglass100':
return hourglass_network.hourglass_100(num_init_channels,
initial_downsample=False)
elif name == 'hourglass20_uniform_size':
return hourglass_network.hourglass_20_uniform_size(num_init_channels)
elif name == 'hourglass20_no_shortcut':
return hourglass_network.hourglass_20_no_shortcut(num_init_channels)
elif name == 'fully_connected':
if not mask_size:
raise ValueError('Mask size must be set.')
return FullyConnectedMaskHead(num_init_channels, mask_size)
elif name.startswith('resnet'):
return ResNetMaskNetwork(name, num_init_channels)
raise ValueError('Unknown network type {}'.format(name))
def crop_masks_within_boxes(masks, boxes, output_size):
"""Crops masks to lie tightly within the boxes.
Args:
masks: A [num_instances, height, width] float tensor of masks.
boxes: A [num_instances, 4] sized tensor of normalized bounding boxes.
output_size: The height and width of the output masks.
Returns:
masks: A [num_instances, output_size, output_size] tensor of masks which
are cropped to be tightly within the given boxes and resized.
"""
masks = spatial_transform_ops.matmul_crop_and_resize(
masks[:, :, :, tf.newaxis], boxes[:, tf.newaxis, :],
[output_size, output_size])
return masks[:, 0, :, :, 0]
def resize_instance_masks(masks, shape):
height, width = shape
masks_ex = masks[:, :, :, tf.newaxis]
masks_ex = tf.image.resize(masks_ex, (height, width),
method=tf.image.ResizeMethod.BILINEAR)
masks = masks_ex[:, :, :, 0]
return masks
def filter_masked_classes(masked_class_ids, classes, weights, masks):
"""Filter out masks whose class IDs are not present in masked_class_ids.
Args:
masked_class_ids: A list of class IDs allowed to have masks. These class IDs
are 1-indexed.
classes: A [num_instances, num_classes] float tensor containing the one-hot
encoded classes.
weights: A [num_instances] float tensor containing the weights of each
sample.
masks: A [num_instances, height, width] tensor containing the mask per
instance.
Returns:
classes_filtered: A [num_instances, num_classes] float tensor containing the
one-hot encoded classes with classes not in masked_class_ids zeroed out.
weights_filtered: A [num_instances] float tensor containing the weights of
each sample with instances whose classes aren't in masked_class_ids
zeroed out.
masks_filtered: A [num_instances, height, width] tensor containing the mask
per instance with masks not belonging to masked_class_ids zeroed out.
"""
if len(masked_class_ids) == 0: # pylint:disable=g-explicit-length-test
return classes, weights, masks
if tf.shape(classes)[0] == 0:
return classes, weights, masks
masked_class_ids = tf.constant(np.array(masked_class_ids, dtype=np.int32))
label_id_offset = 1
masked_class_ids -= label_id_offset
class_ids = tf.argmax(classes, axis=1, output_type=tf.int32)
matched_classes = tf.equal(
class_ids[:, tf.newaxis], masked_class_ids[tf.newaxis, :]
)
matched_classes = tf.reduce_any(matched_classes, axis=1)
matched_classes = tf.cast(matched_classes, tf.float32)
return (
classes * matched_classes[:, tf.newaxis],
weights * matched_classes,
masks * matched_classes[:, tf.newaxis, tf.newaxis]
)
class ResNetMaskNetwork(tf.keras.layers.Layer):
"""A small wrapper around ResNet blocks to predict masks."""
def __init__(self, resnet_type, num_init_channels):
"""Creates the ResNet mask network.
Args:
resnet_type: A string of the form resnetN, where N is in
[4, 8, 12, 16, 20].
num_init_channels: Number of filters in the ResNet block.
"""
super(ResNetMaskNetwork, self).__init__()
nc = num_init_channels
if resnet_type == 'resnet4':
channel_dims = [nc * 2]
blocks = [2]
elif resnet_type == 'resnet8':
channel_dims = [nc * 2]
blocks = [4]
elif resnet_type == 'resnet12':
channel_dims = [nc * 2]
blocks = [6]
elif resnet_type == 'resnet16':
channel_dims = [nc * 2]
blocks = [8]
# Defined such that the channels are roughly similar to the hourglass20.
elif resnet_type == 'resnet20':
channel_dims = [nc * 2, nc * 3]
blocks = [8, 2]
else:
raise ValueError('Unknown resnet type "{}"'.format(resnet_type))
self.input_layer = tf.keras.layers.Conv2D(nc, 1, 1)
# Last channel has to be defined so that batch norm can initialize properly.
model_input = tf.keras.layers.Input([None, None, nc])
output = model_input
for i, (num_blocks, channels) in enumerate(zip(blocks, channel_dims)):
output = resnet_v1.stack_basic(output, filters=channels,
blocks=num_blocks, stride1=1,
name='resnet_mask_block_%d' % i)
self.model = tf.keras.Model(inputs=model_input, outputs=output)
def __call__(self, inputs):
return self.model(self.input_layer(inputs))
class FullyConnectedMaskHead(tf.keras.layers.Layer):
"""A 2 layer fully connected mask head."""
def __init__(self, num_init_channels, mask_size):
super(FullyConnectedMaskHead, self).__init__()
self.fc1 = tf.keras.layers.Dense(units=1024, activation='relu')
self.fc2 = tf.keras.layers.Dense(units=mask_size*mask_size)
self.mask_size = mask_size
self.num_input_channels = num_init_channels
self.input_layer = tf.keras.layers.Conv2D(num_init_channels, 1, 1)
model_input = tf.keras.layers.Input(
[mask_size * mask_size * num_init_channels,])
output = self.fc2(self.fc1(model_input))
self.model = tf.keras.Model(inputs=model_input, outputs=output)
def __call__(self, inputs):
inputs = self.input_layer(inputs)
inputs_shape = tf.shape(inputs)
num_instances = inputs_shape[0]
height = inputs_shape[1]
width = inputs_shape[2]
dims = inputs_shape[3]
flattened_inputs = tf.reshape(inputs,
[num_instances, height * width * dims])
flattened_masks = self.model(flattened_inputs)
return tf.reshape(flattened_masks,
[num_instances, self.mask_size, self.mask_size, 1])
class MaskHeadNetwork(tf.keras.layers.Layer):
"""Mask head class for DeepMAC."""
def __init__(self, network_type, num_init_channels=64,
use_instance_embedding=True, mask_size=None):
"""Initializes the network.
Args:
network_type: A string denoting the kind of network we want to use
internally.
num_init_channels: int, the number of channels in the first block. The
number of channels in the following blocks depends on the network type
used.
use_instance_embedding: bool, if set, we concatenate the instance
embedding to the input while predicting the mask.
mask_size: int, size of the output mask. Required only with
`fully_connected` mask type.
"""
super(MaskHeadNetwork, self).__init__()
self._net = _get_deepmac_network_by_type(
network_type, num_init_channels, mask_size)
self._use_instance_embedding = use_instance_embedding
self.project_out = tf.keras.layers.Conv2D(
filters=1, kernel_size=1, activation=None)
def __call__(self, instance_embedding, pixel_embedding, training):
"""Returns mask logits given object center and spatial embeddings.
Args:
instance_embedding: A [num_instances, embedding_size] float tensor
representing the center embedding vector of each instance.
pixel_embedding: A [num_instances, height, width, pixel_embedding_size]
float tensor representing the per-pixel spatial embedding for each
instance.
training: boolean flag indicating training or testing mode.
Returns:
mask: A [num_instances, height, width] float tensor containing the mask
logits for each instance.
"""
height = tf.shape(pixel_embedding)[1]
width = tf.shape(pixel_embedding)[2]
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
instance_embedding = tf.tile(instance_embedding, [1, height, width, 1])
if self._use_instance_embedding:
inputs = tf.concat([pixel_embedding, instance_embedding], axis=3)
else:
inputs = pixel_embedding
out = self._net(inputs)
if isinstance(out, list):
out = out[-1]
if out.shape[-1] > 1:
out = self.project_out(out)
return tf.squeeze(out, axis=-1)
def deepmac_proto_to_params(deepmac_config):
"""Convert proto to named tuple."""
loss = losses_pb2.Loss()
# Add a dummy localization loss to avoid the loss builder throwing an error.
loss.localization_loss.weighted_l2.CopyFrom(
losses_pb2.WeightedL2LocalizationLoss())
loss.classification_loss.CopyFrom(deepmac_config.classification_loss)
classification_loss, _, _, _, _, _, _ = (losses_builder.build(loss))
return DeepMACParams(
dim=deepmac_config.dim,
classification_loss=classification_loss,
task_loss_weight=deepmac_config.task_loss_weight,
pixel_embedding_dim=deepmac_config.pixel_embedding_dim,
allowed_masked_classes_ids=deepmac_config.allowed_masked_classes_ids,
mask_size=deepmac_config.mask_size,
mask_num_subsamples=deepmac_config.mask_num_subsamples,
use_xy=deepmac_config.use_xy,
network_type=deepmac_config.network_type,
use_instance_embedding=deepmac_config.use_instance_embedding,
num_init_channels=deepmac_config.num_init_channels,
predict_full_resolution_masks=
deepmac_config.predict_full_resolution_masks,
postprocess_crop_size=deepmac_config.postprocess_crop_size
)
class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
"""The experimental CenterNet DeepMAC[1] model.
[1]: https://arxiv.org/abs/2104.00613
"""
def __init__(self,
is_training,
add_summaries,
num_classes,
feature_extractor,
image_resizer_fn,
object_center_params,
object_detection_params,
deepmac_params,
compute_heatmap_sparse=False):
"""Constructs the super class with object center & detection params only."""
self._deepmac_params = deepmac_params
super(DeepMACMetaArch, self).__init__(
is_training=is_training, add_summaries=add_summaries,
num_classes=num_classes, feature_extractor=feature_extractor,
image_resizer_fn=image_resizer_fn,
object_center_params=object_center_params,
object_detection_params=object_detection_params,
compute_heatmap_sparse=compute_heatmap_sparse)
def _construct_prediction_heads(self, num_classes, num_feature_outputs,
class_prediction_bias_init):
super_instance = super(DeepMACMetaArch, self)
prediction_heads = super_instance._construct_prediction_heads( # pylint:disable=protected-access
num_classes, num_feature_outputs, class_prediction_bias_init)
if self._deepmac_params is not None:
prediction_heads[INSTANCE_EMBEDDING] = [
center_net_meta_arch.make_prediction_net(self._deepmac_params.dim)
for _ in range(num_feature_outputs)
]
prediction_heads[PIXEL_EMBEDDING] = [
center_net_meta_arch.make_prediction_net(
self._deepmac_params.pixel_embedding_dim)
for _ in range(num_feature_outputs)
]
self._mask_net = MaskHeadNetwork(
network_type=self._deepmac_params.network_type,
use_instance_embedding=self._deepmac_params.use_instance_embedding,
num_init_channels=self._deepmac_params.num_init_channels)
return prediction_heads
def _get_mask_head_input(self, boxes, pixel_embedding):
"""Get the input to the mask network, given bounding boxes.
Args:
boxes: A [num_instances, 4] float tensor containing bounding boxes in
normalized coordinates.
pixel_embedding: A [height, width, embedding_size] float tensor
containing spatial pixel embeddings.
Returns:
embedding: A [num_instances, mask_height, mask_width, embedding_size + 2]
float tensor containing the inputs to the mask network. For each
bounding box, we concatenate the normalized box coordinates to the
cropped pixel embeddings. If predict_full_resolution_masks is set,
mask_height and mask_width are the same as height and width of
pixel_embedding. If not, mask_height and mask_width are the same as
mask_size.
"""
num_instances = tf.shape(boxes)[0]
mask_size = self._deepmac_params.mask_size
if self._deepmac_params.predict_full_resolution_masks:
num_instances = tf.shape(boxes)[0]
pixel_embedding = pixel_embedding[tf.newaxis, :, :, :]
pixel_embeddings_processed = tf.tile(pixel_embedding,
[num_instances, 1, 1, 1])
else:
# TODO(vighneshb) Explore multilevel_roi_align and align_corners=False.
pixel_embeddings_cropped = spatial_transform_ops.matmul_crop_and_resize(
pixel_embedding[tf.newaxis], boxes[tf.newaxis],
[mask_size, mask_size])
pixel_embeddings_processed = pixel_embeddings_cropped[0]
mask_shape = tf.shape(pixel_embeddings_processed)
mask_height, mask_width = mask_shape[1], mask_shape[2]
y_grid, x_grid = tf.meshgrid(tf.linspace(-1.0, 1.0, mask_height),
tf.linspace(-1.0, 1.0, mask_width),
indexing='ij')
coords = tf.stack([y_grid, x_grid], axis=2)
coords = coords[tf.newaxis, :, :, :]
coords = tf.tile(coords, [num_instances, 1, 1, 1])
if self._deepmac_params.use_xy:
return tf.concat([coords, pixel_embeddings_processed], axis=3)
else:
return pixel_embeddings_processed
def _get_instance_embeddings(self, boxes, instance_embedding):
"""Return the instance embeddings from bounding box centers.
Args:
boxes: A [num_instances, 4] float tensor holding bounding boxes. The
coordinates are in normalized input space.
instance_embedding: A [height, width, embedding_size] float tensor
containing the instance embeddings.
Returns:
instance_embeddings: A [num_instances, embedding_size] shaped float tensor
containing the center embedding for each instance.
"""
blist = box_list.BoxList(boxes)
output_height = tf.shape(instance_embedding)[0]
output_width = tf.shape(instance_embedding)[1]
blist_output = box_list_ops.to_absolute_coordinates(
blist, output_height, output_width, check_range=False)
(y_center_output, x_center_output,
_, _) = blist_output.get_center_coordinates_and_sizes()
center_coords_output = tf.stack([y_center_output, x_center_output], axis=1)
center_coords_output_int = tf.cast(center_coords_output, tf.int32)
center_latents = tf.gather_nd(instance_embedding, center_coords_output_int)
return center_latents
def _get_groundtruth_mask_output(self, boxes, masks):
"""Get the expected mask output for each box.
Args:
boxes: A [num_instances, 4] float tensor containing bounding boxes in
normalized coordinates.
masks: A [num_instances, height, width] float tensor containing binary
ground truth masks.
Returns:
masks: If predict_full_resolution_masks is set, masks are not resized
and the size of this tensor is [num_instances, input_height, input_width].
Otherwise, returns a tensor of size [num_instances, mask_size, mask_size].
"""
mask_size = self._deepmac_params.mask_size
if self._deepmac_params.predict_full_resolution_masks:
return masks
else:
cropped_masks = spatial_transform_ops.matmul_crop_and_resize(
masks[:, :, :, tf.newaxis], boxes[:, tf.newaxis, :],
[mask_size, mask_size])
cropped_masks = tf.stop_gradient(cropped_masks)
cropped_masks = tf.squeeze(cropped_masks, axis=[1, 4])
# TODO(vighneshb) should we discretize masks?
return cropped_masks
def _resize_logits_like_gt(self, logits, gt):
height, width = tf.shape(gt)[1], tf.shape(gt)[2]
return resize_instance_masks(logits, (height, width))
def _compute_per_instance_mask_loss(
self, boxes, masks, instance_embedding, pixel_embedding):
"""Returns the mask loss per instance.
Args:
boxes: A [num_instances, 4] float tensor holding bounding boxes. The
coordinates are in normalized input space.
masks: A [num_instances, input_height, input_width] float tensor
containing the instance masks.
instance_embedding: A [output_height, output_width, embedding_size]
float tensor containing the instance embeddings.
pixel_embedding: optional [output_height, output_width,
pixel_embedding_size] float tensor containing the per-pixel embeddings.
Returns:
mask_loss: A [num_instances] shaped float tensor containing the
mask loss for each instance.
"""
num_instances = tf.shape(boxes)[0]
mask_input = self._get_mask_head_input(
boxes, pixel_embedding)
instance_embeddings = self._get_instance_embeddings(
boxes, instance_embedding)
mask_logits = self._mask_net(
instance_embeddings, mask_input,
training=tf.keras.backend.learning_phase())
mask_gt = self._get_groundtruth_mask_output(boxes, masks)
mask_logits = self._resize_logits_like_gt(mask_logits, mask_gt)
mask_logits = tf.reshape(mask_logits, [num_instances, -1, 1])
mask_gt = tf.reshape(mask_gt, [num_instances, -1, 1])
loss = self._deepmac_params.classification_loss(
prediction_tensor=mask_logits,
target_tensor=mask_gt,
weights=tf.ones_like(mask_logits))
# TODO(vighneshb) Make this configurable via config.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
def _compute_instance_masks_loss(self, prediction_dict):
"""Computes the mask loss.
Args:
prediction_dict: dict from predict() method containing
INSTANCE_EMBEDDING and PIXEL_EMBEDDING prediction.
Both of these are lists of tensors, each of size
[batch_size, height, width, embedding_size].
Returns:
loss: float, the mask loss as a scalar.
"""
gt_boxes_list = self.groundtruth_lists(fields.BoxListFields.boxes)
gt_weights_list = self.groundtruth_lists(fields.BoxListFields.weights)
gt_masks_list = self.groundtruth_lists(fields.BoxListFields.masks)
gt_classes_list = self.groundtruth_lists(fields.BoxListFields.classes)
allowed_masked_classes_ids = (
self._deepmac_params.allowed_masked_classes_ids)
total_loss = 0.0
# Iterate over multiple predictions from the backbone (for hourglass, length=2).
for instance_pred, pixel_pred in zip(
prediction_dict[INSTANCE_EMBEDDING],
prediction_dict[PIXEL_EMBEDDING]):
# Iterate over samples in batch
# TODO(vighneshb) find out how autograph is handling this. Converting
# to a single op may give speed improvements
for i, (boxes, weights, classes, masks) in enumerate(
zip(gt_boxes_list, gt_weights_list, gt_classes_list, gt_masks_list)):
_, weights, masks = filter_masked_classes(allowed_masked_classes_ids,
classes, weights, masks)
num_subsample = self._deepmac_params.mask_num_subsamples
_, weights, boxes, masks = subsample_instances(
classes, weights, boxes, masks, num_subsample)
per_instance_loss = self._compute_per_instance_mask_loss(
boxes, masks, instance_pred[i], pixel_pred[i])
per_instance_loss *= weights
num_instances = tf.maximum(tf.reduce_sum(weights), 1.0)
total_loss += tf.reduce_sum(per_instance_loss) / num_instances
batch_size = len(gt_boxes_list)
num_predictions = len(prediction_dict[INSTANCE_EMBEDDING])
return total_loss / float(batch_size * num_predictions)
def loss(self, prediction_dict, true_image_shapes, scope=None):
losses_dict = super(DeepMACMetaArch, self).loss(
prediction_dict, true_image_shapes, scope)
if self._deepmac_params is not None:
mask_loss = self._compute_instance_masks_loss(
prediction_dict=prediction_dict)
key = LOSS_KEY_PREFIX + '/' + DEEP_MASK_ESTIMATION
losses_dict[key] = (
self._deepmac_params.task_loss_weight * mask_loss
)
return losses_dict
def postprocess(self, prediction_dict, true_image_shapes, **params):
"""Produces boxes given a prediction dict returned by predict().
Args:
prediction_dict: a dictionary holding predicted tensors from "predict"
function.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
the form [height, width, channels] indicating the shapes of true images
in the resized images, as resized images can be padded with zeros.
**params: Currently ignored.
Returns:
detections: a dictionary containing the following fields
detection_masks: (Optional) A uint8 tensor of shape [batch,
max_detections, mask_height, mask_width] with masks for each
detection. Background is specified with 0, and foreground is specified
with positive integers (1 for standard instance segmentation mask, and
1-indexed parts for DensePose task).
And all other fields returned by the super class method.
"""
postprocess_dict = super(DeepMACMetaArch, self).postprocess(
prediction_dict, true_image_shapes, **params)
boxes_strided = postprocess_dict['detection_boxes_strided']
if self._deepmac_params is not None:
masks = self._postprocess_masks(
boxes_strided, prediction_dict[INSTANCE_EMBEDDING][-1],
prediction_dict[PIXEL_EMBEDDING][-1])
postprocess_dict[fields.DetectionResultFields.detection_masks] = masks
return postprocess_dict
def _postprocess_masks(self, boxes_output_stride,
instance_embedding, pixel_embedding):
"""Postprocess masks with the deep mask network.
Args:
boxes_output_stride: A [batch_size, num_instances, 4] float tensor
containing the batch of boxes in the absolute output space of the
feature extractor.
instance_embedding: A [batch_size, output_height, output_width,
embedding_size] float tensor containing instance embeddings.
pixel_embedding: A [batch_size, output_height, output_width,
pixel_embedding_size] float tensor containing the per-pixel embedding.
Returns:
      masks: A float tensor of size [batch_size, num_instances, mask_size,
        mask_size] containing per-box instance mask probabilities.
"""
def process(elems):
boxes, instance_embedding, pixel_embedding = elems
return self._postprocess_sample(boxes, instance_embedding,
pixel_embedding)
max_instances = self._center_params.max_box_predictions
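    # Post-process each image in the batch independently with tf.map_fn;
    # parallel_iterations is set from max_box_predictions, presumably as an
    # upper bound on useful parallelism.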
return tf.map_fn(process, [boxes_output_stride, instance_embedding,
pixel_embedding],
dtype=tf.float32, parallel_iterations=max_instances)
def _postprocess_sample(self, boxes_output_stride,
instance_embedding, pixel_embedding):
"""Post process masks for a single sample.
Args:
boxes_output_stride: A [num_instances, 4] float tensor containing
bounding boxes in the absolute output space.
instance_embedding: A [output_height, output_width, embedding_size]
float tensor containing instance embeddings.
      pixel_embedding: A [output_height, output_width, pixel_embedding_size]
        float tensor containing the per-pixel embedding.
Returns:
      masks: A float tensor of size [num_instances, mask_height, mask_width]
        containing per-box instance mask probabilities. If
        predict_full_resolution_masks is set, the masks are resized to
        postprocess_crop_size. Otherwise, mask_height=mask_width=mask_size.
"""
height, width = (tf.shape(instance_embedding)[0],
tf.shape(instance_embedding)[1])
height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32)
blist = box_list.BoxList(boxes_output_stride)
blist = box_list_ops.to_normalized_coordinates(
blist, height, width, check_range=False)
boxes = blist.get()
mask_input = self._get_mask_head_input(boxes, pixel_embedding)
instance_embeddings = self._get_instance_embeddings(
boxes, instance_embedding)
mask_logits = self._mask_net(
instance_embeddings, mask_input,
training=tf.keras.backend.learning_phase())
# TODO(vighneshb) Explore sweeping mask thresholds.
if self._deepmac_params.predict_full_resolution_masks:
height, width = tf.shape(mask_logits)[1], tf.shape(mask_logits)[2]
height *= self._stride
width *= self._stride
mask_logits = resize_instance_masks(mask_logits, (height, width))
mask_logits = crop_masks_within_boxes(
mask_logits, boxes, self._deepmac_params.postprocess_crop_size)
masks_prob = tf.nn.sigmoid(mask_logits)
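    # The returned masks are sigmoid probabilities in [0, 1]; binarizing
    # (e.g. thresholding at 0.5) is left to downstream consumers.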
return masks_prob
def _transform_boxes_to_feature_coordinates(self, provided_boxes,
true_image_shapes,
resized_image_shape,
instance_embedding):
"""Transforms normalzied boxes to feature map coordinates.
Args:
provided_boxes: A [batch, num_instances, 4] float tensor containing
normalized bounding boxes.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
the form [height, width, channels] indicating the shapes of true images
in the resized images, as resized images can be padded with zeros.
      resized_image_shape: A list of 4 ints or int32 scalar tensors containing
        the shape (N, H, W, C) of the preprocessed inputs.
instance_embedding: A [batch, output_height, output_width, embedding_size]
float tensor containing instance embeddings.
Returns:
A float tensor of size [batch, num_instances, 4] containing boxes whose
coordinates have been transformed to the absolute output space of the
feature extractor.
"""
# Input boxes must be normalized.
shape_utils.assert_box_normalized(provided_boxes)
# Transform the provided boxes to the absolute output space of the feature
# extractor.
height, width = (tf.shape(instance_embedding)[1],
tf.shape(instance_embedding)[2])
resized_image_height = resized_image_shape[1]
resized_image_width = resized_image_shape[2]
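    # Worked example (mirrored by the unit tests): for a true image shape
    # [64, 32, 3] padded into a 64x64 resized image with a 32x32 feature map,
    # a normalized (y, x) = (0.1, 0.5) maps to y = 0.1 * (64 / 64) * 32 = 3.2
    # and x = 0.5 * (32 / 64) * 32 = 8.0.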
def transform_boxes(elems):
boxes_per_image, true_image_shape = elems
blist = box_list.BoxList(boxes_per_image)
      # First transform boxes from true image space to resized image space,
      # since the resized images may contain padding.
blist = box_list_ops.scale(blist,
true_image_shape[0] / resized_image_height,
true_image_shape[1] / resized_image_width)
# Then transform boxes from resized image space (normalized) to the
# feature map space (absolute).
blist = box_list_ops.to_absolute_coordinates(
blist, height, width, check_range=False)
return blist.get()
return tf.map_fn(
transform_boxes, [provided_boxes, true_image_shapes], dtype=tf.float32)
def predict_masks_from_boxes(self, prediction_dict, true_image_shapes,
provided_boxes, **params):
"""Produces masks for the provided boxes.
Args:
prediction_dict: a dictionary holding predicted tensors from "predict"
function.
true_image_shapes: int32 tensor of shape [batch, 3] where each row is of
the form [height, width, channels] indicating the shapes of true images
in the resized images, as resized images can be padded with zeros.
provided_boxes: float tensor of shape [batch, num_boxes, 4] containing
boxes coordinates (normalized) from which we will produce masks.
**params: Currently ignored.
Returns:
detections: a dictionary containing the following fields
        detection_masks: (Optional) A float tensor of shape [batch,
          max_detections, mask_height, mask_width] containing per-pixel mask
          probabilities in [0, 1] for each detection, as produced by the deep
          mask head.
And all other fields returned by the super class method.
"""
postprocess_dict = super(DeepMACMetaArch,
self).postprocess(prediction_dict,
true_image_shapes, **params)
instance_embedding = prediction_dict[INSTANCE_EMBEDDING][-1]
resized_image_shapes = shape_utils.combined_static_and_dynamic_shape(
prediction_dict['preprocessed_inputs'])
boxes_strided = self._transform_boxes_to_feature_coordinates(
provided_boxes, true_image_shapes, resized_image_shapes,
instance_embedding)
if self._deepmac_params is not None:
masks = self._postprocess_masks(
boxes_strided, instance_embedding,
prediction_dict[PIXEL_EMBEDDING][-1])
postprocess_dict[fields.DetectionResultFields.detection_masks] = masks
return postprocess_dict
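# Usage sketch (illustrative variable names, not part of this module): given a
# built DeepMACMetaArch `model` and preprocessed `images`,
#   prediction_dict = model.predict(images, true_image_shapes)
#   detections = model.predict_masks_from_boxes(
#       prediction_dict, true_image_shapes, provided_boxes)
# where provided_boxes is a [batch, num_boxes, 4] float tensor of normalized
# box coordinates.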
"""Tests for google3.third_party.tensorflow_models.object_detection.meta_architectures.deepmac_meta_arch."""
import functools
import unittest
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from object_detection.core import losses
from object_detection.core import preprocessor
from object_detection.meta_architectures import center_net_meta_arch
from object_detection.meta_architectures import deepmac_meta_arch
from object_detection.utils import tf_version
class DummyFeatureExtractor(center_net_meta_arch.CenterNetFeatureExtractor):
def __init__(self,
channel_means,
channel_stds,
bgr_ordering,
num_feature_outputs,
stride):
self._num_feature_outputs = num_feature_outputs
self._stride = stride
super(DummyFeatureExtractor, self).__init__(
channel_means=channel_means, channel_stds=channel_stds,
bgr_ordering=bgr_ordering)
def predict(self):
pass
def loss(self):
pass
def postprocess(self):
pass
def call(self, inputs):
batch_size, input_height, input_width, _ = inputs.shape
fake_output = tf.ones([
batch_size, input_height // self._stride, input_width // self._stride,
64
], dtype=tf.float32)
return [fake_output] * self._num_feature_outputs
@property
def out_stride(self):
return self._stride
@property
def num_feature_outputs(self):
return self._num_feature_outputs
class MockMaskNet(tf.keras.layers.Layer):
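  """Stub mask head that outputs a constant logit of 0.9 at every pixel."""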
def __call__(self, instance_embedding, pixel_embedding, training):
return tf.zeros_like(pixel_embedding[:, :, :, 0]) + 0.9
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False):
"""Builds the DeepMAC meta architecture."""
feature_extractor = DummyFeatureExtractor(
channel_means=(1.0, 2.0, 3.0),
channel_stds=(10., 20., 30.),
bgr_ordering=False,
num_feature_outputs=2,
stride=4)
image_resizer_fn = functools.partial(
preprocessor.resize_to_range,
min_dimension=128,
max_dimension=128,
      pad_to_max_dimension=True)
object_center_params = center_net_meta_arch.ObjectCenterParams(
classification_loss=losses.WeightedSigmoidClassificationLoss(),
object_center_loss_weight=1.0,
min_box_overlap_iou=1.0,
max_box_predictions=5,
use_labeled_classes=False)
if use_dice_loss:
classification_loss = losses.WeightedDiceClassificationLoss(False)
else:
classification_loss = losses.WeightedSigmoidClassificationLoss()
deepmac_params = deepmac_meta_arch.DeepMACParams(
classification_loss=classification_loss,
dim=8,
task_loss_weight=1.0,
pixel_embedding_dim=2,
allowed_masked_classes_ids=[],
mask_size=16,
mask_num_subsamples=-1,
use_xy=True,
network_type='hourglass10',
use_instance_embedding=True,
num_init_channels=8,
predict_full_resolution_masks=predict_full_resolution_masks,
postprocess_crop_size=128
)
object_detection_params = center_net_meta_arch.ObjectDetectionParams(
localization_loss=losses.L1LocalizationLoss(),
offset_loss_weight=1.0,
scale_loss_weight=0.1
)
return deepmac_meta_arch.DeepMACMetaArch(
is_training=True,
add_summaries=False,
num_classes=6,
feature_extractor=feature_extractor,
object_center_params=object_center_params,
deepmac_params=deepmac_params,
object_detection_params=object_detection_params,
image_resizer_fn=image_resizer_fn)
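# Note: with the 128x128 resizer and the stride-4 dummy extractor above, the
# feature maps seen by the tests are 32x32.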
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACUtilsTest(tf.test.TestCase):
def test_subsample_trivial(self):
"""Test subsampling masks."""
boxes = np.arange(4).reshape(4, 1) * np.ones((4, 4))
masks = np.arange(4).reshape(4, 1, 1) * np.ones((4, 32, 32))
weights = np.ones(4)
classes = tf.one_hot(tf.range(4), depth=4)
result = deepmac_meta_arch.subsample_instances(
classes, weights, boxes, masks, 4)
self.assertAllClose(result[0], classes)
self.assertAllClose(result[1], weights)
self.assertAllClose(result[2], boxes)
self.assertAllClose(result[3], masks)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase):
def setUp(self): # pylint:disable=g-missing-super-call
self.model = build_meta_arch()
def test_mask_network(self):
net = deepmac_meta_arch.MaskHeadNetwork('hourglass10', 8)
out = net(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_hourglass20(self):
net = deepmac_meta_arch.MaskHeadNetwork('hourglass20', 8)
out = net(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_resnet(self):
net = deepmac_meta_arch.MaskHeadNetwork('resnet4')
out = net(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_resnet_tf_function(self):
net = deepmac_meta_arch.MaskHeadNetwork('resnet8')
call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_get_mask_head_input(self):
boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]],
dtype=tf.float32)
pixel_embedding = np.zeros((32, 32, 4), dtype=np.float32)
pixel_embedding[:16, :16] = 1.0
pixel_embedding[16:, 16:] = 2.0
pixel_embedding = tf.constant(pixel_embedding)
mask_inputs = self.model._get_mask_head_input(boxes, pixel_embedding)
self.assertEqual(mask_inputs.shape, (2, 16, 16, 6))
y_grid, x_grid = tf.meshgrid(np.linspace(-1.0, 1.0, 16),
np.linspace(-1.0, 1.0, 16), indexing='ij')
for i in range(2):
mask_input = mask_inputs[i]
self.assertAllClose(y_grid, mask_input[:, :, 0])
self.assertAllClose(x_grid, mask_input[:, :, 1])
pixel_embedding = mask_input[:, :, 2:]
self.assertAllClose(np.zeros((16, 16, 4)) + i + 1, pixel_embedding)
def test_get_mask_head_input_no_crop_resize(self):
model = build_meta_arch(predict_full_resolution_masks=True)
boxes = tf.constant([[0., 0., 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]],
dtype=tf.float32)
pixel_embedding_np = np.random.randn(32, 32, 4).astype(np.float32)
pixel_embedding = tf.constant(pixel_embedding_np)
mask_inputs = model._get_mask_head_input(boxes, pixel_embedding)
self.assertEqual(mask_inputs.shape, (2, 32, 32, 6))
y_grid, x_grid = tf.meshgrid(np.linspace(-1.0, 1.0, 32),
np.linspace(-1.0, 1.0, 32), indexing='ij')
for i in range(2):
mask_input = mask_inputs[i]
self.assertAllClose(y_grid, mask_input[:, :, 0])
self.assertAllClose(x_grid, mask_input[:, :, 1])
pixel_embedding = mask_input[:, :, 2:]
self.assertAllClose(pixel_embedding_np, pixel_embedding)
def test_get_instance_embeddings(self):
embeddings = np.zeros((32, 32, 2))
embeddings[8, 8] = 1.0
embeddings[24, 16] = 2.0
embeddings = tf.constant(embeddings)
boxes = tf.constant([[0., 0., 0.5, 0.5], [0.5, 0.0, 1.0, 1.0]])
center_embeddings = self.model._get_instance_embeddings(boxes, embeddings)
self.assertAllClose(center_embeddings, [[1.0, 1.0], [2.0, 2.0]])
def test_get_groundtruth_mask_output(self):
boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]],
dtype=tf.float32)
masks = np.zeros((2, 32, 32), dtype=np.float32)
masks[0, :16, :16] = 0.5
masks[1, 16:, 16:] = 0.1
masks = self.model._get_groundtruth_mask_output(boxes, masks)
self.assertEqual(masks.shape, (2, 16, 16))
self.assertAllClose(masks[0], np.zeros((16, 16)) + 0.5)
self.assertAllClose(masks[1], np.zeros((16, 16)) + 0.1)
def test_get_groundtruth_mask_output_crop_resize(self):
model = build_meta_arch(predict_full_resolution_masks=True)
boxes = tf.constant([[0., 0., 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]],
dtype=tf.float32)
masks = tf.ones((2, 32, 32))
masks = model._get_groundtruth_mask_output(boxes, masks)
self.assertAllClose(masks, np.ones((2, 32, 32)))
def test_per_instance_loss(self):
model = build_meta_arch()
model._mask_net = MockMaskNet()
boxes = tf.constant([[0.0, 0.0, 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]])
masks = np.zeros((2, 32, 32), dtype=np.float32)
masks[0, :16, :16] = 1.0
masks[1, 16:, 16:] = 1.0
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
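    # MockMaskNet outputs a constant logit of 0.9 and the cropped ground truth
    # is all foreground, so the per-instance sigmoid cross-entropy is
    # -log(sigmoid(0.9)).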
self.assertAllClose(
loss, np.zeros(2) - tf.math.log(tf.nn.sigmoid(0.9)))
def test_per_instance_loss_no_crop_resize(self):
model = build_meta_arch(predict_full_resolution_masks=True)
model._mask_net = MockMaskNet()
boxes = tf.constant([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]])
masks = np.ones((2, 128, 128), dtype=np.float32)
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
self.assertAllClose(
loss, np.zeros(2) - tf.math.log(tf.nn.sigmoid(0.9)))
def test_per_instance_loss_no_crop_resize_dice(self):
model = build_meta_arch(predict_full_resolution_masks=True,
use_dice_loss=True)
model._mask_net = MockMaskNet()
boxes = tf.constant([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]])
masks = np.ones((2, 128, 128), dtype=np.float32)
masks = tf.constant(masks)
loss = model._compute_per_instance_mask_loss(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
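    # Dice loss is 1 - 2|X ∩ Y| / (|X| + |Y|). With an all-ones ground truth
    # and a constant predicted probability p = sigmoid(0.9) per pixel, this
    # reduces to 1 - 2p / (1 + p) per instance.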
pred = tf.nn.sigmoid(0.9)
expected = (1.0 - ((2.0 * pred) / (1.0 + pred)))
self.assertAllClose(loss, [expected, expected], rtol=1e-3)
def test_empty_masks(self):
boxes = tf.zeros([0, 4])
masks = tf.zeros([0, 128, 128])
loss = self.model._compute_per_instance_mask_loss(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)))
self.assertEqual(loss.shape, (0,))
def test_postprocess(self):
model = build_meta_arch()
model._mask_net = MockMaskNet()
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes[:, :, [0, 2]] = 0.0
boxes[:, :, [1, 3]] = 8.0
boxes = tf.constant(boxes)
masks = model._postprocess_masks(
boxes, tf.zeros((2, 32, 32, 2)), tf.zeros((2, 32, 32, 2)))
prob = tf.nn.sigmoid(0.9).numpy()
self.assertAllClose(masks, prob * np.ones((2, 3, 16, 16)))
def test_postprocess_no_crop_resize_shape(self):
model = build_meta_arch(predict_full_resolution_masks=True)
model._mask_net = MockMaskNet()
boxes = np.zeros((2, 3, 4), dtype=np.float32)
boxes[:, :, [0, 2]] = 0.0
boxes[:, :, [1, 3]] = 8.0
boxes = tf.constant(boxes)
masks = model._postprocess_masks(
boxes, tf.zeros((2, 32, 32, 2)), tf.zeros((2, 32, 32, 2)))
prob = tf.nn.sigmoid(0.9).numpy()
self.assertAllClose(masks, prob * np.ones((2, 3, 128, 128)))
def test_crop_masks_within_boxes(self):
masks = np.zeros((2, 32, 32))
masks[0, :16, :16] = 1.0
masks[1, 16:, 16:] = 1.0
    boxes = tf.constant([[0.0, 0.0, 15.0 / 32, 15.0 / 32],
                         [0.5, 0.5, 1.0, 1.0]])
masks = deepmac_meta_arch.crop_masks_within_boxes(
masks, boxes, 128)
masks = (masks.numpy() > 0.0).astype(np.float32)
self.assertAlmostEqual(masks.sum(), 2 * 128 * 128)
def test_transform_boxes_to_feature_coordinates(self):
batch_size = 2
model = build_meta_arch()
model._mask_net = MockMaskNet()
boxes = np.zeros((batch_size, 3, 4), dtype=np.float32)
boxes[:, :, [0, 2]] = 0.1
boxes[:, :, [1, 3]] = 0.5
boxes = tf.constant(boxes)
true_image_shapes = tf.constant([
[64, 32, 3], # Image 1 is padded during resizing.
[64, 64, 3], # Image 2 is not padded.
])
resized_image_height = 64
resized_image_width = 64
resized_image_shape = [
batch_size, resized_image_height, resized_image_width, 3
]
feature_map_height = 32
feature_map_width = 32
instance_embedding = tf.zeros(
(batch_size, feature_map_height, feature_map_width, 2))
expected_boxes = np.array([
[ # Image 1
# 0.1 * (64 / resized_image_height) * feature_map_height -> 3.2
# 0.5 * (32 / resized_image_width) * feature_map_width -> 8.0
[3.2, 8., 3.2, 8.],
[3.2, 8., 3.2, 8.],
[3.2, 8., 3.2, 8.],
],
[ # Image 2
# 0.1 * (64 / resized_image_height) * feature_map_height -> 3.2
# 0.5 * (64 / resized_image_width) * feature_map_width -> 16
[3.2, 16., 3.2, 16.],
[3.2, 16., 3.2, 16.],
[3.2, 16., 3.2, 16.],
],
])
box_strided = model._transform_boxes_to_feature_coordinates(
boxes, true_image_shapes, resized_image_shape, instance_embedding)
self.assertAllClose(box_strided, expected_boxes)
def test_fc_tf_function(self):
net = deepmac_meta_arch.MaskHeadNetwork('fully_connected', 8, mask_size=32)
call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 8)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class FullyConnectedMaskHeadTest(tf.test.TestCase):
def test_fc_mask_head(self):
head = deepmac_meta_arch.FullyConnectedMaskHead(512, 16)
inputs = tf.random.uniform([100, 16, 16, 512])
output = head(inputs)
self.assertAllEqual([100, 16, 16, 1], output.numpy().shape)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class ResNetMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(['resnet4', 'resnet8', 'resnet20'])
def test_pass(self, name):
net = deepmac_meta_arch.ResNetMaskNetwork(name, 8)
out = net(tf.zeros((3, 32, 32, 16)))
self.assertEqual(out.shape[:3], (3, 32, 32))
if __name__ == '__main__':
tf.test.main()