Unverified Commit 09d9656f authored by Srihari Humbarwadi's avatar Srihari Humbarwadi Committed by GitHub
Browse files

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents ac671306 49a5706c
...@@ -135,7 +135,15 @@ class SemanticSegmentationTask(base_task.Task): ...@@ -135,7 +135,15 @@ class SemanticSegmentationTask(base_task.Task):
use_groundtruth_dimension=loss_params.use_groundtruth_dimension, use_groundtruth_dimension=loss_params.use_groundtruth_dimension,
top_k_percent_pixels=loss_params.top_k_percent_pixels) top_k_percent_pixels=loss_params.top_k_percent_pixels)
total_loss = segmentation_loss_fn(model_outputs, labels['masks']) total_loss = segmentation_loss_fn(model_outputs['logits'], labels['masks'])
if 'mask_scores' in model_outputs:
mask_scoring_loss_fn = segmentation_losses.MaskScoringLoss(
loss_params.ignore_label)
total_loss += mask_scoring_loss_fn(
model_outputs['mask_scores'],
model_outputs['logits'],
labels['masks'])
if aux_losses: if aux_losses:
total_loss += tf.add_n(aux_losses) total_loss += tf.add_n(aux_losses)
...@@ -144,6 +152,28 @@ class SemanticSegmentationTask(base_task.Task): ...@@ -144,6 +152,28 @@ class SemanticSegmentationTask(base_task.Task):
return total_loss return total_loss
def process_metrics(self, metrics, labels, model_outputs, **kwargs):
  """Process and update metrics.

  Called when using custom training loop API.

  Args:
    metrics: an iterable of metric objects (the return of
      self.build_metrics). Each one must expose `name` and `update_state`.
    labels: the groundtruth. It is passed unchanged to every metric except
      'mask_scores_mse', for which `labels['masks']` is read instead.
    model_outputs: a mapping of model outputs. `model_outputs['logits']` is
      always read; `model_outputs['mask_scores']` is read only for the
      'mask_scores_mse' metric.
    **kwargs: other args (unused here).
  """
  for metric in metrics:
    # Compare the name by value. `is` tests object identity, which is not
    # guaranteed to hold for two equal strings, so the mask-score branch
    # could be skipped silently.
    if metric.name == 'mask_scores_mse':
      # The regression target for the predicted mask scores is derived from
      # the logits and the groundtruth masks by the loss module helper.
      actual_mask_scores = segmentation_losses.get_actual_mask_scores(
          model_outputs['logits'], labels['masks'],
          self.task_config.losses.ignore_label)
      metric.update_state(actual_mask_scores, model_outputs['mask_scores'])
    else:
      metric.update_state(labels, model_outputs['logits'])
def build_metrics(self, training: bool = True): def build_metrics(self, training: bool = True):
"""Gets streaming metrics for training/validation.""" """Gets streaming metrics for training/validation."""
metrics = [] metrics = []
...@@ -153,6 +183,9 @@ class SemanticSegmentationTask(base_task.Task): ...@@ -153,6 +183,9 @@ class SemanticSegmentationTask(base_task.Task):
num_classes=self.task_config.model.num_classes, num_classes=self.task_config.model.num_classes,
rescale_predictions=False, rescale_predictions=False,
dtype=tf.float32)) dtype=tf.float32))
if self.task_config.model.get('mask_scoring_head'):
metrics.append(
tf.keras.metrics.MeanSquaredError(name='mask_scores_mse'))
else: else:
self.iou_metric = segmentation_metrics.PerClassIoU( self.iou_metric = segmentation_metrics.PerClassIoU(
name='per_class_iou', name='per_class_iou',
...@@ -160,6 +193,11 @@ class SemanticSegmentationTask(base_task.Task): ...@@ -160,6 +193,11 @@ class SemanticSegmentationTask(base_task.Task):
rescale_predictions=not self.task_config.validation_data rescale_predictions=not self.task_config.validation_data
.resize_eval_groundtruth, .resize_eval_groundtruth,
dtype=tf.float32) dtype=tf.float32)
if self.task_config.validation_data.resize_eval_groundtruth and self.task_config.model.get('mask_scoring_head'): # pylint: disable=line-too-long
# Mask scores metric can only be computed if labels are scaled to match
# predicted mask scores.
metrics.append(
tf.keras.metrics.MeanSquaredError(name='mask_scores_mse'))
# Update state on CPU if TPUStrategy due to dynamic resizing. # Update state on CPU if TPUStrategy due to dynamic resizing.
self._process_iou_metric_on_cpu = isinstance( self._process_iou_metric_on_cpu = isinstance(
...@@ -194,6 +232,8 @@ class SemanticSegmentationTask(base_task.Task): ...@@ -194,6 +232,8 @@ class SemanticSegmentationTask(base_task.Task):
num_replicas = tf.distribute.get_strategy().num_replicas_in_sync num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
with tf.GradientTape() as tape: with tf.GradientTape() as tape:
outputs = model(features, training=True) outputs = model(features, training=True)
if isinstance(outputs, tf.Tensor):
outputs = {'logits': outputs}
# Casting output layer as float32 is necessary when mixed_precision is # Casting output layer as float32 is necessary when mixed_precision is
# mixed_float16 or mixed_bfloat16 to ensure output is casted as float32. # mixed_float16 or mixed_bfloat16 to ensure output is casted as float32.
outputs = tf.nest.map_structure( outputs = tf.nest.map_structure(
...@@ -249,6 +289,8 @@ class SemanticSegmentationTask(base_task.Task): ...@@ -249,6 +289,8 @@ class SemanticSegmentationTask(base_task.Task):
features, input_partition_dims) features, input_partition_dims)
outputs = self.inference_step(features, model) outputs = self.inference_step(features, model)
if isinstance(outputs, tf.Tensor):
outputs = {'logits': outputs}
outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs) outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
if self.task_config.validation_data.resize_eval_groundtruth: if self.task_config.validation_data.resize_eval_groundtruth:
...@@ -260,9 +302,9 @@ class SemanticSegmentationTask(base_task.Task): ...@@ -260,9 +302,9 @@ class SemanticSegmentationTask(base_task.Task):
logs = {self.loss: loss} logs = {self.loss: loss}
if self._process_iou_metric_on_cpu: if self._process_iou_metric_on_cpu:
logs.update({self.iou_metric.name: (labels, outputs)}) logs.update({self.iou_metric.name: (labels, outputs['logits'])})
else: else:
self.iou_metric.update_state(labels, outputs) self.iou_metric.update_state(labels, outputs['logits'])
if metrics: if metrics:
self.process_metrics(metrics, labels, outputs) self.process_metrics(metrics, labels, outputs)
......
# Object Detection Models on TensorFlow 2
**WARNING**: This repository is deprecated and replaced by the solid
implementations inside vision/beta/. All the content has been moved to
[official/legacy/detection](https://github.com/tensorflow/models/tree/master/official/legacy/detection).
## Prerequisites
To get started, download the code from TensorFlow models GitHub repository or
use the pre-installed Google Cloud VM.
```bash
git clone https://github.com/tensorflow/models.git
```
Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also, here are
a few packages you need to install to get started:
```bash
sudo apt-get install -y python-tk && \
pip3 install -r ~/models/official/requirements.txt
```
## Train RetinaNet on TPU
### Train a vanilla ResNet-50 based RetinaNet.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--params_override="{ type: retinanet, train: { checkpoint: { path: ${RESNET_CHECKPOINT?}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
Note: The ResNet implementation under
[detection/](https://github.com/tensorflow/models/tree/master/official/vision/detection)
is currently different from the one under
[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
so the checkpoints are not compatible.
We will unify the implementation soon.
### Train a SpineNet-49 based RetinaNet.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--params_override="{ type: retinanet, architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
### Train a custom RetinaNet using the config file.
First, create a YAML config file, e.g. *my_retinanet.yaml*. This file specifies
the parameters to be overridden, which should at least include the following
fields.
```YAML
# my_retinanet.yaml
type: 'retinanet'
train:
train_file_pattern: <path to the TFRecord training data>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
## Train RetinaNet on GPU
Training on GPU is similar to that on TPU. The major change is the strategy
type (use "[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)" for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)" for single GPU).
Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
train:
total_steps: 1
batch_size: 8
train_file_pattern: <Eval TFRecord file pattern>
use_tpu: False
"
```
---
## Train Mask R-CNN on TPU
### Train a vanilla ResNet-50 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
Note: The ResNet implementation under
[detection/](https://github.com/tensorflow/models/tree/master/official/vision/detection)
is currently different from the one under
[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
so the checkpoints are not compatible.
We will unify the implementation soon.
### Train a SpineNet-49 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--model=mask_rcnn \
--params_override="{architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
### Train a custom Mask R-CNN using the config file.
First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
This file specifies the parameters to be overridden,
which should at least include the following fields.
```YAML
# my_maskrcnn.yaml
train:
train_file_pattern: <path to the TFRecord training data>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
## Train Mask R-CNN on GPU
Training on GPU is similar to that on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for single GPU).
Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=mask_rcnn \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Eval TFRecord file pattern>
use_tpu: False
"
```
## Train ShapeMask on TPU
### Train a ResNet-50 based ShapeMask.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
SHAPE_PRIOR_PATH="<path to shape priors>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } shapemask_head: {use_category_for_mask: true, shape_prior_path: ${SHAPE_PRIOR_PATH}} }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
The shape priors can be downloaded [here](https://storage.googleapis.com/cloud-tpu-checkpoints/shapemask/kmeans_class_priors_91x20x32x32.npy).
### Train a custom ShapeMask using the config file.
First, create a YAML config file, e.g. *my_shapemask.yaml*.
This file specifies the parameters to be overridden:
```YAML
# my_shapemask.yaml
train:
train_file_pattern: <path to the TFRecord training data>
total_steps: <total steps to train>
batch_size: <training batch size>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
batch_size: <evaluation batch size>
shapemask_head:
shape_prior_path: <path to shape priors>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
## Train ShapeMask on GPU
Training on GPU is similar to that on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for single GPU).
Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
A single GPU example
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=shapemask \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Eval TFRecord file pattern>
use_tpu: False
"
```
### Run the evaluation (after training)
```
python3 /usr/share/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=eval \
--model=shapemask \
--params_override="{eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN}, eval_samples: 5000 } }"
```
`MODEL_DIR` needs to point to the trained path of ShapeMask model.
Change `strategy_type=mirrored` and `num_gpus=1` to run on a GPU.
Note: The JSON groundtruth file is useful for [COCO dataset](http://cocodataset.org/#home) and can be
downloaded from the [COCO website](http://cocodataset.org/#download). For a custom dataset, it is unnecessary because the groundtruth can be included in the TFRecord files.
## References
1. [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002).
Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Dollár. IEEE
International Conference on Computer Vision (ICCV), 2017.
...@@ -12,3 +12,5 @@ ...@@ -12,3 +12,5 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Deprecating the vision/detection folder."""
# Fail at import time so that code still importing this package is pointed at
# the new location named in the message.
raise ImportError('This module has been moved to official/legacy/detection')
# Image Classification

This repository is deprecated and replaced by the solid implementations inside
vision/beta/. All the content has been moved to
[official/legacy/image_classification](https://github.com/tensorflow/models/tree/master/official/legacy/image_classification).

**Warning:** the features in the `image_classification/` folder have been fully
integrated into vision/beta. Please use the [new code base](../beta/README.md).
This folder contains TF 2.0 model examples for image classification:
* [MNIST](#mnist)
* [Classifier Trainer](#classifier-trainer), a framework that uses the Keras
compile/fit methods for image classification models, including:
* ResNet
* EfficientNet[^1]
[^1]: Currently a work in progress. We cannot match "AutoAugment (AA)" in [the original version](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet).
For more information about other types of models, please refer to this
[README file](../../README.md).
## Before you begin
Please make sure that you have the latest version of TensorFlow
installed and
[add the models folder to your Python path](/official/#running-the-models).
### ImageNet preparation
#### Using TFDS
`classifier_trainer.py` supports ImageNet with
[TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview).
Please see the following [example snippet](https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/scripts/download_and_prepare.py)
for more information on how to use TFDS to download and prepare datasets, and
specifically the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md)
for manual download instructions.
#### Legacy TFRecords
Download the ImageNet dataset and convert it to TFRecord format.
The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
provide a few options.
Note that the legacy ResNet runners, e.g. [resnet/resnet_ctl_imagenet_main.py](resnet/resnet_ctl_imagenet_main.py)
require TFRecords whereas `classifier_trainer.py` can use both by setting the
builder to 'records' or 'tfds' in the configurations.
### Running on Cloud TPUs
Note: These models will **not** work with TPUs on Colab.
You can train image classification models on Cloud TPUs using
[tf.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf.distribute.TPUStrategy?version=nightly).
If you are not familiar with Cloud TPUs, it is strongly recommended that you go
through the
[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
create a TPU and GCE VM.
### Running on multiple GPU hosts
You can also train these models on multiple hosts, each with GPUs, using
[tf.distribute.experimental.MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy).
The easiest way to run multi-host benchmarks is to set the
[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
appropriately at each host. e.g., to run using `MultiWorkerMirroredStrategy` on
2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
available GPUs at each host.
## MNIST
To download the data and run the MNIST sample model locally for the first time,
run the following command:
```bash
python3 mnist_main.py \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--train_epochs=10 \
--distribution_strategy=one_device \
--num_gpus=$NUM_GPUS \
--download
```
To train the model on a Cloud TPU, run the following command:
```bash
python3 mnist_main.py \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--train_epochs=10 \
--distribution_strategy=tpu \
--download
```
Note: the `--download` flag is only required the first time you run the model.
## Classifier Trainer
The classifier trainer is a unified framework for running image classification
models using Keras's compile/fit methods. Experiments should be provided in the
form of YAML files, some examples are included within the configs/examples
folder. Please see [configs/examples](./configs/examples) for more example
configurations.
The provided configuration files use a per-replica batch size, which is scaled
by the number of devices. For instance, if `batch size` = 64, then for 1 GPU
the global batch size would be 64 * 1 = 64. For 8 GPUs, the global batch size
would be 64 * 8 = 512. Similarly, for a v3-8 TPU, the global batch size would
be 64 * 8 = 512, and for a v3-32, the global batch size is 64 * 32 = 2048.
### ResNet50
#### On GPU:
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=resnet \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/resnet/imagenet/gpu.yaml \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
To train on multiple hosts, each with GPUs attached using
[MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy)
please update `runtime` section in gpu.yaml
(or override using `--params_override`) with:
```YAML
# gpu.yaml
runtime:
distribution_strategy: 'multi_worker_mirrored'
worker_hosts: '$HOST1:port,$HOST2:port'
num_gpus: $NUM_GPUS
task_index: 0
```
Set `task_index: 0` on the first host, `task_index: 1` on the second, and so
on. `$HOST1` and `$HOST2` are the IP addresses of the hosts, and `port`
can be any free port on the hosts. Only the first host will write
TensorBoard Summaries and save checkpoints.
#### On TPU:
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=resnet \
--dataset=imagenet \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/resnet/imagenet/tpu.yaml
```
### EfficientNet
**Note: EfficientNet development is a work in progress.**
#### On GPU:
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=efficientnet \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml \
--params_override='runtime.num_gpus=$NUM_GPUS'
```
#### On TPU:
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=efficientnet \
--dataset=imagenet \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
```
Note that the number of GPU devices can be overridden in the command line using
`--params_override`. The TPU does not need this override as the device is fixed
by providing the TPU address or name with the `--tpu` flag.
...@@ -12,3 +12,6 @@ ...@@ -12,3 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Deprecating the vision/image_classification folder."""
# Fail at import time so that code still importing this package is pointed at
# the new location named in the message.
raise ImportError(
    'This module has been moved to official/legacy/image_classification')
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
"""Utilities for creating loop functions.""" """Utilities for creating loop functions."""
from absl import logging
from orbit.utils import tpu_summaries from orbit.utils import tpu_summaries
import tensorflow as tf import tensorflow as tf
...@@ -65,8 +66,8 @@ def create_loop_fn(step_fn): ...@@ -65,8 +66,8 @@ def create_loop_fn(step_fn):
The final state returned by `reduce_fn`, or `None` if `state` and The final state returned by `reduce_fn`, or `None` if `state` and
`reduce_fn` are not provided. `reduce_fn` are not provided.
""" """
step = 0
try: try:
step = 0
# To make sure the OutOfRangeError exception can be handled well under # To make sure the OutOfRangeError exception can be handled well under
# async remote eager, we need to wrap the loop body in `async_scope`. # async remote eager, we need to wrap the loop body in `async_scope`.
with tf.experimental.async_scope(): with tf.experimental.async_scope():
...@@ -77,6 +78,7 @@ def create_loop_fn(step_fn): ...@@ -77,6 +78,7 @@ def create_loop_fn(step_fn):
step += 1 step += 1
return state return state
except (StopIteration, tf.errors.OutOfRangeError): except (StopIteration, tf.errors.OutOfRangeError):
logging.info("The dataset iterator is exhausted after %d steps.", step)
tf.experimental.async_clear_error() tf.experimental.async_clear_error()
return state return state
......
## Global features: CNN Image Retrieval
This Python toolbox implements the training and testing of the approach described in the papers:
[![Paper](http://img.shields.io/badge/paper-arXiv.1711.02512-B3181B.svg)](https://arxiv.org/abs/1711.02512)
```
"Fine-tuning CNN Image Retrieval with No Human Annotation",
Radenović F., Tolias G., Chum O.,
TPAMI 2018
```
[![Paper](http://img.shields.io/badge/paper-arXiv.1604.02426-B3181B.svg)](https://arxiv.org/abs/1604.02426)
```
"CNN Image Retrieval Learns from BoW: Unsupervised Fine-Tuning with Hard Examples",
Radenović F., Tolias G., Chum O.,
ECCV 2016
```
Fine-tuned CNNs are used for global feature extraction with the goal of using
those for image retrieval. The networks are trained on the <i>SfM120k</i>
landmark images dataset.
<img src="http://cmp.felk.cvut.cz/cnnimageretrieval/img/cnnimageretrieval_network_medium.png" width="100%"/>
When initializing the network, one of the popular pre-trained architectures
for classification tasks (such as ResNet or VGG) is used as the network’s
backbone. The
fully connected layers of such architectures are discarded, resulting in a fully
convolutional backbone. Then, given an input image of the size [W × H × C],
where C is the number of channels, W and H are image width and height,
respectively; the output is a tensor X with dimensions [W' × H' × K], where
K is the number of feature maps in the last layer. Tensor X
can be considered as a set of the input image’s deep local features. For
deep convolutional features, the simple aggregation approach based on global
pooling arguably provides the best results. This method is fast, has a small
number of parameters, and a low risk of overfitting. Keeping this in mind,
we convert local features to a global descriptor vector using one of the
retrieval system’s global poolings (MAC, SPoC, or GeM). After this stage,
the feature vector has dimensionality equal to K; with MAC pooling, for
example, it is made up of the maximum activation per feature map. The final output dimensionality for the most
common networks varies from 512 to 2048, making this image representation
relatively compact.
Vectors that have been pooled are subsequently L2-normalized. The obtained
representation is then optionally passed through the fully connected
layers before being subjected to a
new L2 re-normalization. The finally produced image representation allows
comparing the resemblance of two images by simply using their inner product.
### Install DELF library
To be able to use this code, please follow
[these instructions](../../../../INSTALL_INSTRUCTIONS.md) to properly install
the DELF library.
### Usage
<details>
<summary><b>Training</b></summary><br/>
Navigate (`cd`) to the folder `DELF_ROOT/delf/python/training/global_features`.
Example training script is located in ```DELF_ROOT/delf/python/training/global_features/train.py```.
```
python3 train.py [--arch ARCH] [--batch_size N] [--data_root PATH]
[--debug] [--directory PATH] [--epochs N] [--gpu_id ID]
[--image_size SIZE] [--launch_tensorboard] [--loss LOSS]
[--loss_margin LM] [--lr LR] [--momentum M] [--multiscale SCALES]
[--neg_num N] [--optimizer OPTIMIZER] [--pool POOL] [--pool_size N]
[--pretrained] [--precompute_whitening DATASET] [--resume]
[--query_size N] [--test_datasets DATASET] [--test_freq N]
[--test_whiten] [--training_dataset DATASET] [--update_every N]
[--validation_type TYPE] [--weight_decay N] [--whitening]
```
For detailed explanation of the options run:
```
python3 train.py -helpfull
```
Standard training of our models was run with the following parameters:
```
python3 train.py \
--directory="DESTINATION_PATH" \
--gpu_ids='0' \
--data_root="TRAINING_DATA_DIRECTORY" \
--training_dataset='retrieval-SfM-120k' \
--test_datasets='roxford5k,rparis6k' \
--arch='ResNet101' \
--pool='gem' \
--whitening=True \
--debug=True \
--loss='triplet' \
--loss_margin=0.85 \
--optimizer='adam' \
--lr=5e-7 --neg_num=3 --query_size=2000 \
--pool_size=20000 --batch_size=5 \
--image_size=1024 --epochs=100 --test_freq=5 \
--multiscale='[1, 2**(1/2), 1/2**(1/2)]'
```
**Note**: Data and networks used for training and testing are automatically downloaded when using the example training
script (```DELF_ROOT/delf/python/training/global_features/train.py```).
</details>
<details>
<summary><b>Training logic flow</b></summary><br/>
**Initialization phase**
1. Checking if required datasets are downloaded and automatically download them (both test and train/val) if they are
not present in the data folder.
1. Setting up the logging and creating a logging/checkpoint directory.
1. Initializing the model according to the user-provided parameters
(architecture/pooling/whitening/pretrained etc.).
1. Defining loss (contrastive/triplet) according to the user parameters.
1. Defining optimizer (Adam/SGD with learning rate/weight decay/momentum) according to the user parameters.
1. Initializing CheckpointManager and resuming from the latest checkpoint if the resume flag is set.
1. Launching Tensorboard if the flag is set.
1. Initializing training (and validation, if required) datasets.
1. Freezing the BatchNorm weights update: since training is done for one image at a time, the statistics would not be computed per batch, so we freeze them (i.e., use the pretrained ImageNet statistics).
1. Evaluating the network performance before training (on the test datasets).
**Training phase**
The main training loop (for the required number of epochs):
1. Finding the hard negative pairs in the dataset (using the forward pass through the model)
1. Creating the training dataset from a generator which changes every epoch. Each
element in the dataset consists of 1 x Positive image, 1 x Query image,
N x Hard negative images (N is specified by the `neg_num` flag), and an array
specifying the Positive (-1), Query (0), Negative (1) images.
1. Performing one training step and calculating the final epoch loss.
1. If validation is required, finding hard negatives in the validation set,
which has the same structure as the training set. Performing one validation
step and calculating the loss.
1. Evaluating on the test datasets every `test_freq` epochs.
1. Saving checkpoint (optimizer and the model weights).
</details>
## Exporting the Trained Model
Assuming the training output (the TensorFlow checkpoint) is located in the
`--directory` path, the following code exports the model:
```
python3 model/export_CNN_global_model.py \
[--ckpt_path PATH] [--export_path PATH] [--input_scales_list LIST]
[--multi_scale_pool_type TYPE] [--normalize_global_descriptor BOOL]
[--arch ARCHITECTURE] [--pool POOLING] [--whitening BOOL]
```
*NOTE:* Path to the checkpoint must include .h5 file.
## Testing the trained model
After the trained model has been exported, it can be used to extract global
features similarly as for the DELG model. Please follow
[these instructions](https://github.com/tensorflow/models/tree/master/research/delf/delf/python/training#testing-the-trained-model).
After training with the standard training setup for 100 epochs, the
following results are obtained on the Roxford and RParis datasets under
single-scale evaluation:
```
>> roxford5k: mAP E: 74.88, M: 58.28, H: 30.4
>> roxford5k: mP@k[1, 5, 10] E: [89.71 84.8 79.07],
M: [91.43 84.67 78.24],
H: [68.57 53.29 43.29]
>> rparis6k: mAP E: 89.21, M: 73.69, H: 49.1
>> rparis6k: mP@k[1, 5, 10] E: [98.57 97.43 95.57],
M: [98.57 99.14 98.14],
H: [94.29 90. 87.29]
```
\ No newline at end of file
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Export global CNN feature tensorflow inference model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
import tensorflow as tf
from delf.python.training.model import global_model
from delf.python.training.model import export_model_utils
FLAGS = flags.FLAGS

# Locations of the checkpoint to read and of the SavedModel to write.
flags.DEFINE_string('ckpt_path', None, 'Path to saved checkpoint.')
flags.DEFINE_string('export_path', None, 'Path where model will be exported.')

# Export-time behaviour: input scales, multi-scale pooling and normalization.
flags.DEFINE_list(
    'input_scales_list',
    None,
    help=('Optional input image scales to use. If None (default), an input '
          'end-point '
          '"input_scales" is added for the exported model. If not None, the '
          'specified list of floats will be hard-coded as the desired input '
          'scales.'))
flags.DEFINE_enum(
    'multi_scale_pool_type',
    'None',
    ['None', 'average', 'sum'],
    help=("If 'None' (default), the model is exported with an output end-point "
          "'global_descriptors', where the global descriptor for each scale is "
          "returned separately. If not 'None', the global descriptor of each "
          "scale is"
          ' pooled and a 1D global descriptor is returned, with output end-point '
          "'global_descriptor'."))
flags.DEFINE_boolean(
    'normalize_global_descriptor',
    False,
    help='If True, L2-normalizes global descriptor.')

# Network architecture and initialization options.
flags.DEFINE_string(
    'arch', 'ResNet101', help='model architecture (default: ResNet101)')
flags.DEFINE_string('pool', 'gem', help='pooling options (default: gem)')
flags.DEFINE_boolean(
    'whitening',
    False,
    help=('train model with learnable whitening (linear layer) '
          'after the pooling'))
def _NormalizeImages(images, *args, **kwargs):
  """Normalize pixel values in image.

  Applies the Keras ImageNet 'caffe' preprocessing and returns its result.
  The result used to be discarded, so the input came back unchanged.

  Args:
    images: `Tensor`, images to normalize.
    *args: Ignored. Accepted so this function can be used wherever a
      normalization function with extra positional parameters is expected.
    **kwargs: Ignored. `ExtractGlobalFeatures` calls its normalization function
      with `pixel_value_offset` / `pixel_value_scale` keywords, which do not
      apply to this preprocessing mode.

  Returns:
    normalized_images: `Tensor`, normalized images (float32).
  """
  del args, kwargs  # Unused; kept only for call-signature compatibility.
  # Cast first: the exported signatures feed uint8 images, and subtracting the
  # per-channel means is only meaningful in floating point.
  images = tf.cast(images, tf.float32)
  return tf.keras.applications.imagenet_utils.preprocess_input(
      images, mode='caffe')
class _ExtractModule(tf.Module):
  """Helper module to build and save global feature model."""

  def __init__(self,
               multi_scale_pool_type='None',
               normalize_global_descriptor=False,
               input_scales_tensor=None):
    """Initialization of global feature model.

    The network itself is built from the `arch`, `pool` and `whitening` flags,
    without pretrained weights; call `LoadWeights` to restore a checkpoint.

    Args:
      multi_scale_pool_type: Type of multi-scale pooling to perform.
      normalize_global_descriptor: Whether to L2-normalize global
        descriptor.
      input_scales_tensor: If None, the exported function to be used
        should be ExtractFeatures, where an input end-point "input_scales" is
        added for the exported model. If not None, the specified 1D tensor of
        floats will be hard-coded as the desired input scales, in conjunction
        with ExtractFeaturesFixedScales.
    """
    # Let tf.Module set up its own bookkeeping (module name / name scope).
    super(_ExtractModule, self).__init__()

    self._multi_scale_pool_type = multi_scale_pool_type
    self._normalize_global_descriptor = normalize_global_descriptor
    if input_scales_tensor is None:
      self._input_scales_tensor = []
    else:
      self._input_scales_tensor = input_scales_tensor

    self._model = global_model.GlobalFeatureNet(
        FLAGS.arch, FLAGS.pool, FLAGS.whitening, pretrained=False)

  def LoadWeights(self, checkpoint_path):
    """Loads the model weights from `checkpoint_path`."""
    self._model.load_weights(checkpoint_path)

  @tf.function(input_signature=[
      tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8,
                    name='input_image'),
      tf.TensorSpec(shape=[None], dtype=tf.float32, name='input_scales'),
      tf.TensorSpec(shape=[None], dtype=tf.int32,
                    name='input_global_scales_ind')
  ])
  def ExtractFeatures(self, input_image, input_scales,
                      input_global_scales_ind):
    """Extracts global descriptors for an image at the requested scales.

    Args:
      input_image: uint8 image tensor of shape [None, None, 3].
      input_scales: 1D float32 tensor of image scales.
      input_global_scales_ind: 1D int32 tensor of indices into `input_scales`.

    Returns:
      Dict with a single entry, 'global_descriptors', holding the output of
      `export_model_utils.ExtractGlobalFeatures`.
    """
    extracted_features = export_model_utils.ExtractGlobalFeatures(
        input_image,
        input_scales,
        input_global_scales_ind,
        lambda x: self._model(x, training=False),
        multi_scale_pool_type=self._multi_scale_pool_type,
        normalize_global_descriptor=self._normalize_global_descriptor,
        # Pass the function itself; it is invoked by ExtractGlobalFeatures.
        # Calling it here (with no arguments) raised a TypeError.
        normalization_function=_NormalizeImages)

    named_output_tensors = {}
    named_output_tensors['global_descriptors'] = tf.identity(
        extracted_features, name='global_descriptors')
    return named_output_tensors

  @tf.function(input_signature=[
      tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8, name='input_image')
  ])
  def ExtractFeaturesFixedScales(self, input_image):
    """Extracts global descriptors using the scales fixed at construction.

    Args:
      input_image: uint8 image tensor of shape [None, None, 3].

    Returns:
      Same dict as `ExtractFeatures`, computed with every hard-coded scale.
    """
    return self.ExtractFeatures(input_image, self._input_scales_tensor,
                                tf.range(tf.size(self._input_scales_tensor)))
def main(argv):
  """Builds the global feature module, loads its weights and exports it.

  Args:
    argv: Command-line arguments left over after flag parsing; only the
      program name is allowed.

  Raises:
    app.UsageError: If extra positional arguments are given, or if
      `--export_path` or `--ckpt_path` is not set.
    ValueError: If `export_path` already exists.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  export_path = FLAGS.export_path
  checkpoint_path = FLAGS.ckpt_path
  # Both flags default to None. Fail with a clear usage message instead of an
  # opaque TypeError from os.path.exists(None) or from the weight loading.
  if export_path is None:
    raise app.UsageError('--export_path must be specified.')
  if checkpoint_path is None:
    raise app.UsageError('--ckpt_path must be specified.')

  if os.path.exists(export_path):
    raise ValueError('export_path %s already exists.' % export_path)

  # With fixed scales, they are baked into the exported graph as a constant.
  if FLAGS.input_scales_list is None:
    input_scales_tensor = None
  else:
    input_scales_tensor = tf.constant(
        [float(s) for s in FLAGS.input_scales_list],
        dtype=tf.float32,
        shape=[len(FLAGS.input_scales_list)],
        name='input_scales')

  module = _ExtractModule(FLAGS.multi_scale_pool_type,
                          FLAGS.normalize_global_descriptor,
                          input_scales_tensor)

  # Load the weights.
  module.LoadWeights(checkpoint_path)
  print('Checkpoint loaded from ', checkpoint_path)

  # Save the module. The serving signature depends on whether the scales are
  # supplied at inference time or hard-coded.
  if FLAGS.input_scales_list is None:
    served_function = module.ExtractFeatures
  else:
    served_function = module.ExtractFeaturesFixedScales

  tf.saved_model.save(
      module, export_path, signatures={'serving_default': served_function})


if __name__ == '__main__':
  app.run(main)
...@@ -183,7 +183,8 @@ def ExtractGlobalFeatures(image, ...@@ -183,7 +183,8 @@ def ExtractGlobalFeatures(image,
global_scales_ind, global_scales_ind,
model_fn, model_fn,
multi_scale_pool_type='None', multi_scale_pool_type='None',
normalize_global_descriptor=False): normalize_global_descriptor=False,
normalization_function=gld.NormalizeImages):
"""Extract global features for input image. """Extract global features for input image.
Args: Args:
...@@ -201,6 +202,7 @@ def ExtractGlobalFeatures(image, ...@@ -201,6 +202,7 @@ def ExtractGlobalFeatures(image,
and a 1D global descriptor is returned. and a 1D global descriptor is returned.
normalize_global_descriptor: If True, output global descriptors are normalize_global_descriptor: If True, output global descriptors are
L2-normalized. L2-normalized.
normalization_function: Function used for normalization.
Returns: Returns:
global_descriptors: If `multi_scale_pool_type` is 'None', returns a [S, D] global_descriptors: If `multi_scale_pool_type` is 'None', returns a [S, D]
...@@ -213,7 +215,7 @@ def ExtractGlobalFeatures(image, ...@@ -213,7 +215,7 @@ def ExtractGlobalFeatures(image,
""" """
original_image_shape_float = tf.gather( original_image_shape_float = tf.gather(
tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1]) tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1])
image_tensor = gld.NormalizeImages( image_tensor = normalization_function(
image, pixel_value_offset=128.0, pixel_value_scale=128.0) image, pixel_value_offset=128.0, pixel_value_scale=128.0)
image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims') image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')
......
{ {
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
...@@ -55,20 +39,22 @@ ...@@ -55,20 +39,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "hCQlBGJkZTR2" "id": "hCQlBGJkZTR2"
}, },
"outputs": [],
"source": [ "source": [
"import tensorflow as tf" "import tensorflow as tf"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "aw-Ba-5RUhMs" "id": "aw-Ba-5RUhMs"
}, },
"outputs": [],
"source": [ "source": [
"# Install the tensorflow Object Detection API...\n", "# Install the tensorflow Object Detection API...\n",
"# If you're running this offline, you also might need to install the protobuf-compiler:\n", "# If you're running this offline, you also might need to install the protobuf-compiler:\n",
...@@ -87,9 +73,7 @@ ...@@ -87,9 +73,7 @@
"\n", "\n",
"# Test the installation\n", "# Test the installation\n",
"! python object_detection/builders/model_builder_tf2_test.py" "! python object_detection/builders/model_builder_tf2_test.py"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
...@@ -113,19 +97,21 @@ ...@@ -113,19 +97,21 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "sKYfhq7CKZ4B" "id": "sKYfhq7CKZ4B"
}, },
"outputs": [],
"source": [ "source": [
"%mkdir /content/dataset\n", "%mkdir /content/dataset\n",
"%cd /content/dataset\n", "%cd /content/dataset\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz\n", "! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz\n", "! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz\n",
"! tar zxf images.tar.gz\n", "! tar zxf images.tar.gz\n",
"! tar zxf annotations.tar.gz" "! tar zxf annotations.tar.gz\n",
], "\n",
"execution_count": null, "XML_PATH = '/content/dataset/annotations/xmls'"
"outputs": [] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
...@@ -133,28 +119,53 @@ ...@@ -133,28 +119,53 @@
"id": "44vtL0nsAqXg" "id": "44vtL0nsAqXg"
}, },
"source": [ "source": [
"In this case, we want to reduce the PETS dataset to match the collection of cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)):\n", "Because the following k-means script will process all XML annotations, we want to reduce the PETS dataset to include only the cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)). So we delete all annotation files that are **not** Abyssinian or American bulldog:\n",
"\n" "\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "8gcUoBU2K_s7" "id": "ih48zFbl6jM7"
}, },
"outputs": [],
"source": [ "source": [
"! cp /content/dataset/annotations/list.txt /content/dataset/annotations/list_petsdataset.txt\n", "! (cd /content/dataset/annotations/xmls/ \u0026\u0026 \\\n",
"! cp /content/dataset/annotations/trainval.txt /content/dataset/annotations/trainval_petsdataset.txt\n", " find . ! \\( -name 'Abyssinian*' -o -name 'american_bulldog*' \\) -type f -exec rm -f {} \\; )"
"! cp /content/dataset/annotations/test.txt /content/dataset/annotations/test_petsdataset.txt\n", ]
"! grep \"Abyssinian\" /content/dataset/annotations/list_petsdataset.txt > /content/dataset/annotations/list.txt\n", },
"! grep \"american_bulldog\" /content/dataset/annotations/list_petsdataset.txt >> /content/dataset/annotations/list.txt\n", {
"! grep \"Abyssinian\" /content/dataset/annotations/trainval_petsdataset.txt > /content/dataset/annotations/trainval.txt\n", "cell_type": "markdown",
"! grep \"american_bulldog\" /content/dataset/annotations/trainval_petsdataset.txt >> /content/dataset/annotations/trainval.txt\n", "metadata": {
"! grep \"Abyssinian\" /content/dataset/annotations/test_petsdataset.txt > /content/dataset/annotations/test.txt\n", "id": "KG8uraCK-RSM"
"! grep \"american_bulldog\" /content/dataset/annotations/test_petsdataset.txt >> /content/dataset/annotations/test.txt" },
], "source": [
"### Upload your own dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "m0bh_iKD-Xz4"
},
"source": [
"To generate the anchor box ratios for your own dataset, upload a ZIP file with your annotation files (click the **Files** tab on the left, and drag-drop your ZIP file there), and then uncomment the following code to unzip it and specify the path to the directory with your annotation files:"
]
},
{
"cell_type": "code",
"execution_count": null, "execution_count": null,
"outputs": [] "metadata": {
"id": "M0j_vWDR3WkK"
},
"outputs": [],
"source": [
"# %cd /content/\n",
"# !unzip dataset.zip\n",
"\n",
"# XML_PATH = '/content/dataset/annotations/xmls'"
]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
...@@ -188,23 +199,24 @@ ...@@ -188,23 +199,24 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "vCB8Dfs0Xlyv" "id": "vCB8Dfs0Xlyv"
}, },
"outputs": [],
"source": [ "source": [
"import sys\n", "import sys\n",
"import glob\n", "import os\n",
"import numpy as np\n", "import numpy as np\n",
"import xml.etree.ElementTree as ET\n", "import xml.etree.ElementTree as ET\n",
"\n", "\n",
"from sklearn.cluster import KMeans\n", "from sklearn.cluster import KMeans\n",
"\n", "\n",
"def xml_to_boxes(path, classes, rescale_width=None, rescale_height=None):\n", "def xml_to_boxes(path, rescale_width=None, rescale_height=None):\n",
" \"\"\"Extracts bounding-box widths and heights from ground-truth dataset.\n", " \"\"\"Extracts bounding-box widths and heights from ground-truth dataset.\n",
"\n", "\n",
" Args:\n", " Args:\n",
" path : Path to .xml annotation files for your dataset.\n", " path : Path to .xml annotation files for your dataset.\n",
" classes : List of classes that are part of dataset.\n",
" rescale_width : Scaling factor to rescale width of bounding box.\n", " rescale_width : Scaling factor to rescale width of bounding box.\n",
" rescale_height : Scaling factor to rescale height of bounding box.\n", " rescale_height : Scaling factor to rescale height of bounding box.\n",
"\n", "\n",
...@@ -213,23 +225,20 @@ ...@@ -213,23 +225,20 @@
" \"\"\"\n", " \"\"\"\n",
"\n", "\n",
" xml_list = []\n", " xml_list = []\n",
" for clss in classes:\n", " filenames = os.listdir(os.path.join(path))\n",
" for xml_file in glob.glob(path + '/'+clss+'*'):\n", " filenames = [os.path.join(path, f) for f in filenames if (f.endswith('.xml'))]\n",
" if xml_file.endswith('.xml'):\n", " for xml_file in filenames:\n",
" tree = ET.parse(xml_file)\n", " tree = ET.parse(xml_file)\n",
" root = tree.getroot()\n", " root = tree.getroot()\n",
" for member in root.findall('object'):\n", " for member in root.findall('object'):\n",
" bndbox = member.find('bndbox')\n", " bndbox = member.find('bndbox')\n",
" bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n", " bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n",
" bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n", " bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n",
" if rescale_width and rescale_height:\n", " if rescale_width and rescale_height:\n",
" size = root.find('size')\n", " size = root.find('size')\n",
" bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n", " bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n",
" bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n", " bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n",
"\n", " xml_list.append([bbox_width, bbox_height])\n",
" xml_list.append([bbox_width, bbox_height])\n",
" else:\n",
" continue\n",
" bboxes = np.array(xml_list)\n", " bboxes = np.array(xml_list)\n",
" return bboxes\n", " return bboxes\n",
"\n", "\n",
...@@ -275,10 +284,10 @@ ...@@ -275,10 +284,10 @@
" assert len(bboxes), \"You must provide bounding boxes\"\n", " assert len(bboxes), \"You must provide bounding boxes\"\n",
"\n", "\n",
" normalized_bboxes = bboxes / np.sqrt(bboxes.prod(axis=1, keepdims=True))\n", " normalized_bboxes = bboxes / np.sqrt(bboxes.prod(axis=1, keepdims=True))\n",
"\n", " \n",
" # Using kmeans to find centroids of the width/height clusters\n", " # Using kmeans to find centroids of the width/height clusters\n",
" kmeans = KMeans(\n", " kmeans = KMeans(\n",
" init='random', n_clusters=num_aspect_ratios,random_state=0, max_iter=kmeans_max_iter)\n", " init='random', n_clusters=num_aspect_ratios, random_state=0, max_iter=kmeans_max_iter)\n",
" kmeans.fit(X=normalized_bboxes)\n", " kmeans.fit(X=normalized_bboxes)\n",
" ar = kmeans.cluster_centers_\n", " ar = kmeans.cluster_centers_\n",
"\n", "\n",
...@@ -292,9 +301,7 @@ ...@@ -292,9 +301,7 @@
" aspect_ratios = [w/h for w,h in ar]\n", " aspect_ratios = [w/h for w,h in ar]\n",
"\n", "\n",
" return aspect_ratios, avg_iou_perc" " return aspect_ratios, avg_iou_perc"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
...@@ -323,13 +330,12 @@ ...@@ -323,13 +330,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "cNw-vX3nfl1g" "id": "cNw-vX3nfl1g"
}, },
"outputs": [],
"source": [ "source": [
"classes = ['Abyssinian','american_bulldog']\n",
"xml_path = '/content/dataset/annotations/xmls'\n",
"\n",
"# Tune this based on your accuracy/speed goals as described above\n", "# Tune this based on your accuracy/speed goals as described above\n",
"num_aspect_ratios = 4 # can be [2,3,4,5,6]\n", "num_aspect_ratios = 4 # can be [2,3,4,5,6]\n",
"\n", "\n",
...@@ -342,8 +348,7 @@ ...@@ -342,8 +348,7 @@
"height = 320\n", "height = 320\n",
"\n", "\n",
"# Get the ground-truth bounding boxes for our dataset\n", "# Get the ground-truth bounding boxes for our dataset\n",
"bboxes = xml_to_boxes(path=xml_path, classes=classes,\n", "bboxes = xml_to_boxes(path=XML_PATH, rescale_width=width, rescale_height=height)\n",
" rescale_width=width, rescale_height=height)\n",
"\n", "\n",
"aspect_ratios, avg_iou_perc = kmeans_aspect_ratios(\n", "aspect_ratios, avg_iou_perc = kmeans_aspect_ratios(\n",
" bboxes=bboxes,\n", " bboxes=bboxes,\n",
...@@ -354,9 +359,7 @@ ...@@ -354,9 +359,7 @@
"\n", "\n",
"print('Aspect ratios generated:', [round(ar,2) for ar in aspect_ratios])\n", "print('Aspect ratios generated:', [round(ar,2) for ar in aspect_ratios])\n",
"print('Average IOU with anchors:', avg_iou_perc)" "print('Average IOU with anchors:', avg_iou_perc)"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
...@@ -378,9 +381,11 @@ ...@@ -378,9 +381,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"metadata": { "metadata": {
"id": "AlMffd3rgKW2" "id": "AlMffd3rgKW2"
}, },
"outputs": [],
"source": [ "source": [
"import tensorflow as tf\n", "import tensorflow as tf\n",
"from google.protobuf import text_format\n", "from google.protobuf import text_format\n",
...@@ -404,9 +409,7 @@ ...@@ -404,9 +409,7 @@
" f.write(config_text)\n", " f.write(config_text)\n",
"# Check for updated aspect ratios in the config\n", "# Check for updated aspect ratios in the config\n",
"!cat /content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config" "!cat /content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config"
], ]
"execution_count": null,
"outputs": []
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
...@@ -441,5 +444,22 @@ ...@@ -441,5 +444,22 @@
"\n" "\n"
] ]
} }
] ],
} "metadata": {
\ No newline at end of file "colab": {
"collapsed_sections": [],
"name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
...@@ -89,6 +89,7 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)): ...@@ -89,6 +89,7 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
""" """
self._num_classes = num_classes self._num_classes = num_classes
self._groundtruth_lists = {} self._groundtruth_lists = {}
self._training_step = None
super(DetectionModel, self).__init__() super(DetectionModel, self).__init__()
...@@ -132,6 +133,13 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)): ...@@ -132,6 +133,13 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
""" """
return field in self._groundtruth_lists return field in self._groundtruth_lists
@property
def training_step(self):
  """Returns the training step stored on the model.

  Raises:
    ValueError: If no training step has been stored (the value is None).
  """
  step = self._training_step
  if step is not None:
    return step
  raise ValueError('Training step was not provided to the model.')
@staticmethod @staticmethod
def get_side_inputs(features): def get_side_inputs(features):
"""Get side inputs from input features. """Get side inputs from input features.
...@@ -318,7 +326,9 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)): ...@@ -318,7 +326,9 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
groundtruth_verified_neg_classes=None, groundtruth_verified_neg_classes=None,
groundtruth_not_exhaustive_classes=None, groundtruth_not_exhaustive_classes=None,
groundtruth_keypoint_depths_list=None, groundtruth_keypoint_depths_list=None,
groundtruth_keypoint_depth_weights_list=None): groundtruth_keypoint_depth_weights_list=None,
groundtruth_image_classes=None,
training_step=None):
"""Provide groundtruth tensors. """Provide groundtruth tensors.
Args: Args:
...@@ -389,6 +399,11 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)): ...@@ -389,6 +399,11 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
groundtruth_keypoint_depth_weights_list: a list of 2-D tf.float32 tensors groundtruth_keypoint_depth_weights_list: a list of 2-D tf.float32 tensors
of shape [num_boxes, num_keypoints] containing the weights of the of shape [num_boxes, num_keypoints] containing the weights of the
relative depths. relative depths.
groundtruth_image_classes: A list of 1-D tf.float32 tensors of shape
[num_classes], containing label indices encoded as k-hot of the classes
that are present or not present in the image.
training_step: An integer denoting the current training step. This is
useful when models want to anneal loss terms.
""" """
self._groundtruth_lists[fields.BoxListFields.boxes] = groundtruth_boxes_list self._groundtruth_lists[fields.BoxListFields.boxes] = groundtruth_boxes_list
self._groundtruth_lists[ self._groundtruth_lists[
...@@ -463,11 +478,17 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)): ...@@ -463,11 +478,17 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
self._groundtruth_lists[ self._groundtruth_lists[
fields.InputDataFields fields.InputDataFields
.groundtruth_verified_neg_classes] = groundtruth_verified_neg_classes .groundtruth_verified_neg_classes] = groundtruth_verified_neg_classes
if groundtruth_image_classes:
self._groundtruth_lists[
fields.InputDataFields
.groundtruth_image_classes] = groundtruth_image_classes
if groundtruth_not_exhaustive_classes: if groundtruth_not_exhaustive_classes:
self._groundtruth_lists[ self._groundtruth_lists[
fields.InputDataFields fields.InputDataFields
.groundtruth_not_exhaustive_classes] = ( .groundtruth_not_exhaustive_classes] = (
groundtruth_not_exhaustive_classes) groundtruth_not_exhaustive_classes)
if training_step is not None:
self._training_step = training_step
@abc.abstractmethod @abc.abstractmethod
def regularization_losses(self): def regularization_losses(self):
......
...@@ -925,7 +925,9 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -925,7 +925,9 @@ class CenterNetCenterHeatmapTargetAssigner(object):
compute_heatmap_sparse=False, compute_heatmap_sparse=False,
keypoint_class_id=None, keypoint_class_id=None,
keypoint_indices=None, keypoint_indices=None,
keypoint_weights_for_center=None): keypoint_weights_for_center=None,
box_heatmap_type='adaptive_gaussian',
heatmap_exponent=1.0):
"""Initializes the target assigner. """Initializes the target assigner.
Args: Args:
...@@ -947,6 +949,17 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -947,6 +949,17 @@ class CenterNetCenterHeatmapTargetAssigner(object):
the number of keypoints. The object center is calculated by the weighted the number of keypoints. The object center is calculated by the weighted
mean of the keypoint locations. If not provided, the object center is mean of the keypoint locations. If not provided, the object center is
determined by the center of the bounding box (default behavior). determined by the center of the bounding box (default behavior).
box_heatmap_type: str, the algorithm used to compute the box heatmap,
used when calling the assign_center_targets_from_boxes method.
Options are:
'adaptive_gaussian': A box-size adaptive Gaussian from the original
paper[1].
'iou': IOU based heatmap target where each point is assigned an IOU
based on its location, assuming that it produced a box centered at
that point with the correct size.
heatmap_exponent: float, the generated heatmap is raised to this power.
A number > 1 will result in the heatmap being more peaky and a number
< 1 will cause the heatmap to be more spread out.
""" """
self._stride = stride self._stride = stride
...@@ -955,6 +968,8 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -955,6 +968,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
self._keypoint_class_id = keypoint_class_id self._keypoint_class_id = keypoint_class_id
self._keypoint_indices = keypoint_indices self._keypoint_indices = keypoint_indices
self._keypoint_weights_for_center = keypoint_weights_for_center self._keypoint_weights_for_center = keypoint_weights_for_center
self._box_heatmap_type = box_heatmap_type
self._heatmap_exponent = heatmap_exponent
def assign_center_targets_from_boxes(self, def assign_center_targets_from_boxes(self,
height, height,
...@@ -1018,19 +1033,29 @@ class CenterNetCenterHeatmapTargetAssigner(object): ...@@ -1018,19 +1033,29 @@ class CenterNetCenterHeatmapTargetAssigner(object):
self._min_overlap) self._min_overlap)
# Apply the Gaussian kernel to the center coordinates. Returned heatmap # Apply the Gaussian kernel to the center coordinates. Returned heatmap
# has shape of [out_height, out_width, num_classes] # has shape of [out_height, out_width, num_classes]
heatmap = ta_utils.coordinates_to_heatmap(
y_grid=y_grid, if self._box_heatmap_type == 'adaptive_gaussian':
x_grid=x_grid, heatmap = ta_utils.coordinates_to_heatmap(
y_coordinates=y_center, y_grid=y_grid,
x_coordinates=x_center, x_grid=x_grid,
sigma=sigma, y_coordinates=y_center,
channel_onehot=class_targets, x_coordinates=x_center,
channel_weights=weights, sigma=sigma,
sparse=self._compute_heatmap_sparse) channel_onehot=class_targets,
channel_weights=weights,
sparse=self._compute_heatmap_sparse)
elif self._box_heatmap_type == 'iou':
heatmap = ta_utils.coordinates_to_iou(y_grid, x_grid, boxes,
class_targets, weights)
else:
raise ValueError(f'Unknown heatmap type - {self._box_heatmap_type}')
heatmaps.append(heatmap) heatmaps.append(heatmap)
# Return the stacked heatmaps over the batch. # Return the stacked heatmaps over the batch.
return tf.stack(heatmaps, axis=0) stacked_heatmaps = tf.stack(heatmaps, axis=0)
return (tf.pow(stacked_heatmaps, self._heatmap_exponent) if
self._heatmap_exponent != 1.0 else stacked_heatmaps)
def assign_center_targets_from_keypoints(self, def assign_center_targets_from_keypoints(self,
height, height,
......
...@@ -1678,6 +1678,66 @@ class CenterNetBoxTargetAssignerTest(test_case.TestCase): ...@@ -1678,6 +1678,66 @@ class CenterNetBoxTargetAssignerTest(test_case.TestCase):
np.testing.assert_array_equal(preds, [[1, 2], [3, 4], [5, 6], [7, 8]]) np.testing.assert_array_equal(preds, [[1, 2], [3, 4], [5, 6], [7, 8]])
class CenterNetIOUTargetAssignerTest(test_case.TestCase):
  """Tests CenterNetCenterHeatmapTargetAssigner with box_heatmap_type='iou'."""

  def setUp(self):
    super(CenterNetIOUTargetAssignerTest, self).setUp()

    # Box fixtures shared by the tests below.
    # Presumably normalized [ymin, xmin, ymax, xmax] -- TODO confirm.
    # NOTE(review): _box_center_small, _box_center_offset and
    # _box_odd_coordinates are not referenced by any test in this class.
    self._box_center = [0.0, 0.0, 1.0, 1.0]
    self._box_center_small = [0.25, 0.25, 0.75, 0.75]
    self._box_lower_left = [0.5, 0.0, 1.0, 0.5]
    self._box_center_offset = [0.1, 0.05, 1.0, 1.0]
    self._box_odd_coordinates = [0.1625, 0.2125, 0.5625, 0.9625]

  def test_center_location(self):
    """Test that the centers are at the correct location."""

    def graph_fn():
      # Two images with two boxes each; classes are one-hot over 4 classes.
      box_batch = [tf.constant([self._box_center, self._box_lower_left]),
                   tf.constant([self._box_lower_left, self._box_center])]

      classes = [
          tf.one_hot([0, 1], depth=4),
          tf.one_hot([2, 2], depth=4)
      ]

      # First positional argument is 4 (presumably the stride, which would
      # give a 20x20 output grid for the 80x80 input -- TODO confirm).
      assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
          4, box_heatmap_type='iou')
      targets = assigner.assign_center_targets_from_boxes(
          80, 80, box_batch, classes)
      return targets

    targets = self.execute(graph_fn, [])

    # Image 0: the class-0 and class-1 heatmaps each peak at 1.0 at the
    # expected (row, column) location.
    self.assertEqual((10, 10), _array_argmax(targets[0, :, :, 0]))
    self.assertAlmostEqual(1.0, targets[0, 10, 10, 0])

    self.assertEqual((15, 5), _array_argmax(targets[0, :, :, 1]))
    self.assertAlmostEqual(1.0, targets[0, 15, 5, 1])

    # Image 1: both boxes are labeled class 2, so that channel has two peaks.
    self.assertAlmostEqual(1.0, targets[1, 15, 5, 2])
    self.assertAlmostEqual(1.0, targets[1, 10, 10, 2])
    # Class 1 has no box in image 1, so this location is expected to be zero.
    self.assertAlmostEqual(0.0, targets[1, 0, 19, 1])

  def test_exponent(self):
    """Test that heatmap_exponent is applied to the generated heatmap."""

    def graph_fn():
      # NOTE(review): box_batch has two entries (two boxes each) while
      # classes has a single entry with one one-hot row -- confirm this
      # mismatch is intended.
      box_batch = [tf.constant([self._box_center, self._box_lower_left]),
                   tf.constant([self._box_lower_left, self._box_center])]

      classes = [
          tf.one_hot([0], depth=2),
      ]

      # Baseline assigner: no heatmap_exponent passed.
      assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
          1, box_heatmap_type='iou')
      targets = assigner.assign_center_targets_from_boxes(
          4, 4, box_batch, classes)

      # Same configuration, but with heatmap_exponent=0.5.
      assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
          1, box_heatmap_type='iou', heatmap_exponent=0.5)
      targets_pow = assigner.assign_center_targets_from_boxes(
          4, 4, box_batch, classes)
      return targets, targets_pow

    targets, targets_pow = self.execute(graph_fn, [])
    # Pick a location whose value is strictly below 1 so that the exponent
    # has a visible effect, then check baseline == (value ** 0.5) ** 2.
    self.assertLess(targets[0, 2, 3, 0], 1.0)
    self.assertLess(targets_pow[0, 2, 3, 0], 1.0)
    self.assertAlmostEqual(targets[0, 2, 3, 0], targets_pow[0, 2, 3, 0] ** 2)
class CenterNetKeypointTargetAssignerTest(test_case.TestCase): class CenterNetKeypointTargetAssignerTest(test_case.TestCase):
def test_keypoint_heatmap_targets(self): def test_keypoint_heatmap_targets(self):
......
...@@ -10,12 +10,12 @@ devices. It enables on-device machine learning inference with low latency and a ...@@ -10,12 +10,12 @@ devices. It enables on-device machine learning inference with low latency and a
small binary size. TensorFlow Lite uses many techniques for this such as small binary size. TensorFlow Lite uses many techniques for this such as
quantized kernels that allow smaller and faster (fixed-point math) models. quantized kernels that allow smaller and faster (fixed-point math) models.
For this section, you will need to build [TensorFlow from For this section, you will need to build
source](https://www.tensorflow.org/install/install_sources) to get the [TensorFlow from source](https://www.tensorflow.org/install/install_sources) to
TensorFlow Lite support for the SSD model. At this time only SSD models are supported. get the TensorFlow Lite support for the SSD model. At this time only SSD models
Models like faster_rcnn are not supported at this time. You will also need to install the are supported. Models like faster_rcnn are not supported at this time. You will
[bazel build also need to install the
tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel). [bazel build tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel).
To make these commands easier to run, let’s set up some environment variables: To make these commands easier to run, let’s set up some environment variables:
...@@ -96,7 +96,17 @@ bazel run -c opt tensorflow/lite/python:tflite_convert -- \ ...@@ -96,7 +96,17 @@ bazel run -c opt tensorflow/lite/python:tflite_convert -- \
--allow_custom_ops --allow_custom_ops
``` ```
# Running our model on Android ## Adding Metadata to the model
To make it easier to use tflite models on mobile, you will need to add
[metadata](https://www.tensorflow.org/lite/convert/metadata) to your model and
also
[pack](https://www.tensorflow.org/lite/convert/metadata#pack_metadata_and_associated_files_into_the_model)
the associated labels file to it.
If you need more information, this process is also explained in the
[Metadata writer Object detectors documentation](https://www.tensorflow.org/lite/convert/metadata_writer_tutorial#object_detectors).
## Running our model on Android
To run our TensorFlow Lite model on device, we will use Android Studio to build To run our TensorFlow Lite model on device, we will use Android Studio to build
and run the TensorFlow Lite detection example with the new model. The example is and run the TensorFlow Lite detection example with the new model. The example is
...@@ -119,8 +129,8 @@ cp /tmp/tflite/detect.tflite \ ...@@ -119,8 +129,8 @@ cp /tmp/tflite/detect.tflite \
$TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
``` ```
You will also need to copy your new labelmap labelmap.txt to the assets It's important to notice that the labels file should be packed in the model (as
directory. mentioned previously)
We will now edit the gradle build file to use these assets. First, open the We will now edit the gradle build file to use these assets. First, open the
`build.gradle` file `build.gradle` file
...@@ -128,17 +138,15 @@ We will now edit the gradle build file to use these assets. First, open the ...@@ -128,17 +138,15 @@ We will now edit the gradle build file to use these assets. First, open the
out the model download script to avoid your assets being overwritten: `// apply out the model download script to avoid your assets being overwritten: `// apply
from:'download_model.gradle'` ``` from:'download_model.gradle'` ```
If your model is named `detect.tflite`, and your labels file `labelmap.txt`, the If your model is named `detect.tflite`, the example will use it automatically as
example will use them automatically as long as they've been properly copied into long as they've been properly copied into the base assets directory. If you need
the base assets directory. If you need to use a custom path or filename, open up to use a custom path or filename, open up the
the
$TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
file in a text editor and find the definition of TF_OD_API_LABELS_FILE. Update file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Note that
this path to point to your new label map file: if your model is quantized, the flag TF_OD_API_IS_QUANTIZED is set to true, and
"labels_list.txt". Note that if your model is quantized, if your model is floating point, the flag TF_OD_API_IS_QUANTIZED is set to
the flag TF_OD_API_IS_QUANTIZED is set to true, and if your model is floating false. This new section of DetectorActivity.java should now look as follows for
point, the flag TF_OD_API_IS_QUANTIZED is set to false. This new section of a quantized model:
DetectorActivity.java should now look as follows for a quantized model:
```shell ```shell
private static final boolean TF_OD_API_IS_QUANTIZED = true; private static final boolean TF_OD_API_IS_QUANTIZED = true;
......
...@@ -92,27 +92,15 @@ converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8, ...@@ -92,27 +92,15 @@ converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8,
converter.representative_dataset = <...> converter.representative_dataset = <...>
``` ```
### Step 3: Add Metadata ### Step 3: add Metadata to the model
The model needs to be packed with To make it easier to use tflite models on mobile, you will need to add
[TFLite Metadata](https://www.tensorflow.org/lite/convert/metadata) to enable [metadata](https://www.tensorflow.org/lite/convert/metadata) to your model and
easy integration into mobile apps using the also
[TFLite Task Library](https://www.tensorflow.org/lite/inference_with_metadata/task_library/object_detector). [pack](https://www.tensorflow.org/lite/convert/metadata#pack_metadata_and_associated_files_into_the_model)
This metadata helps the inference code perform the correct pre & post processing the associated labels file to it.
as required by the model. Use the following code to create the metadata. If you need more information, This process is also explained in the
[Image classification sample](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/metadata)
```python
from tflite_support.metadata_writers import object_detector
from tflite_support.metadata_writers import writer_utils
writer = object_detector.MetadataWriter.create_for_inference(
writer_utils.load_file(_TFLITE_MODEL_PATH), input_norm_mean=[0],
input_norm_std=[255], label_file_paths=[_TFLITE_LABEL_PATH])
writer_utils.save_file(writer.populate(), _TFLITE_MODEL_WITH_METADATA_PATH)
```
See the TFLite Metadata Writer API [documentation](https://www.tensorflow.org/lite/convert/metadata_writer_tutorial#object_detectors)
for more details.
## Running our model on Android ## Running our model on Android
...@@ -142,9 +130,9 @@ the ...@@ -142,9 +130,9 @@ the
that support API >= 21. Additional details are available on the that support API >= 21. Additional details are available on the
[TensorFlow Lite example page](https://github.com/tensorflow/examples/tree/master/lite/examples/object_detection/android). [TensorFlow Lite example page](https://github.com/tensorflow/examples/tree/master/lite/examples/object_detection/android).
Next we need to point the app to our new detect.tflite file and give it the Next we need to point the app to our new detect.tflite file . Specifically, we
names of our new labels. Specifically, we will copy our TensorFlow Lite will copy our TensorFlow Lite flatbuffer to the app assets directory with the
model with metadata to the app assets directory with the following command: following command:
```shell ```shell
mkdir $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets mkdir $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
...@@ -152,21 +140,30 @@ cp /tmp/tflite/detect.tflite \ ...@@ -152,21 +140,30 @@ cp /tmp/tflite/detect.tflite \
$TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
``` ```
It's important to notice that the labels file should be packed in the model (as
mentioned in Step 3).
We will now edit the gradle build file to use these assets. First, open the We will now edit the gradle build file to use these assets. First, open the
`build.gradle` file `build.gradle` file
`$TF_EXAMPLES/lite/examples/object_detection/android/app/build.gradle`. Comment `$TF_EXAMPLES/lite/examples/object_detection/android/app/build.gradle`. Comment
out the model download script to avoid your assets being overwritten: out the model download script to avoid your assets being overwritten: `// apply
from:'download_model.gradle'` ```
```shell
// apply from:'download_model.gradle'
```
If your model is named `detect.tflite`, the example will use it automatically as If your model is named `detect.tflite`, the example will use it automatically as
long as they've been properly copied into the base assets directory. If you need long as they've been properly copied into the base assets directory. If you need
to use a custom path or filename, open up the to use a custom path or filename, open up the
$TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Update file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Note that
this path to point to your new model file. if your model is quantized, the flag TF_OD_API_IS_QUANTIZED is set to true, and
if your model is floating point, the flag TF_OD_API_IS_QUANTIZED is set to
false. This new section of DetectorActivity.java should now look as follows for
a quantized model:
```shell
private static final boolean TF_OD_API_IS_QUANTIZED = true;
private static final String TF_OD_API_MODEL_FILE = "detect.tflite";
private static final String TF_OD_API_LABELS_FILE = "labels_list.txt";
```
Once you’ve copied the TensorFlow Lite model and edited the gradle build script Once you’ve copied the TensorFlow Lite model and edited the gradle build script
to not use the downloaded assets, you can build and deploy the app using the to not use the downloaded assets, you can build and deploy the app using the
......
...@@ -668,7 +668,8 @@ def _get_labels_dict(input_dict): ...@@ -668,7 +668,8 @@ def _get_labels_dict(input_dict):
fields.InputDataFields.groundtruth_dp_surface_coords, fields.InputDataFields.groundtruth_dp_surface_coords,
fields.InputDataFields.groundtruth_track_ids, fields.InputDataFields.groundtruth_track_ids,
fields.InputDataFields.groundtruth_verified_neg_classes, fields.InputDataFields.groundtruth_verified_neg_classes,
fields.InputDataFields.groundtruth_not_exhaustive_classes fields.InputDataFields.groundtruth_not_exhaustive_classes,
fields.InputDataFields.groundtruth_image_classes,
] ]
for key in optional_label_keys: for key in optional_label_keys:
......
...@@ -12,12 +12,12 @@ import tensorflow as tf ...@@ -12,12 +12,12 @@ import tensorflow as tf
from object_detection.builders import losses_builder from object_detection.builders import losses_builder
from object_detection.core import box_list from object_detection.core import box_list
from object_detection.core import box_list_ops from object_detection.core import box_list_ops
from object_detection.core import losses
from object_detection.core import preprocessor from object_detection.core import preprocessor
from object_detection.core import standard_fields as fields from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import center_net_meta_arch from object_detection.meta_architectures import center_net_meta_arch
from object_detection.models.keras_models import hourglass_network from object_detection.models.keras_models import hourglass_network
from object_detection.models.keras_models import resnet_v1 from object_detection.models.keras_models import resnet_v1
from object_detection.protos import center_net_pb2
from object_detection.protos import losses_pb2 from object_detection.protos import losses_pb2
from object_detection.protos import preprocessor_pb2 from object_detection.protos import preprocessor_pb2
from object_detection.utils import shape_utils from object_detection.utils import shape_utils
...@@ -38,46 +38,26 @@ NEIGHBORS_2D = [[-1, -1], [-1, 0], [-1, 1], ...@@ -38,46 +38,26 @@ NEIGHBORS_2D = [[-1, -1], [-1, 0], [-1, 1],
[0, -1], [0, 1], [0, -1], [0, 1],
[1, -1], [1, 0], [1, 1]] [1, -1], [1, 0], [1, 1]]
WEAK_LOSSES = [DEEP_MASK_BOX_CONSISTENCY, DEEP_MASK_COLOR_CONSISTENCY] WEAK_LOSSES = [DEEP_MASK_BOX_CONSISTENCY, DEEP_MASK_COLOR_CONSISTENCY]
MASK_LOSSES = WEAK_LOSSES + [DEEP_MASK_ESTIMATION]
class DeepMACParams( DeepMACParams = collections.namedtuple('DeepMACParams', [
collections.namedtuple('DeepMACParams', [
'classification_loss', 'dim', 'task_loss_weight', 'pixel_embedding_dim', 'classification_loss', 'dim', 'task_loss_weight', 'pixel_embedding_dim',
'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples', 'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels', 'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
'predict_full_resolution_masks', 'postprocess_crop_size', 'predict_full_resolution_masks', 'postprocess_crop_size',
'max_roi_jitter_ratio', 'roi_jitter_mode', 'max_roi_jitter_ratio', 'roi_jitter_mode',
'box_consistency_loss_weight', 'color_consistency_threshold', 'box_consistency_loss_weight', 'color_consistency_threshold',
'color_consistency_dilation', 'color_consistency_loss_weight' 'color_consistency_dilation', 'color_consistency_loss_weight',
])): 'box_consistency_loss_normalize', 'box_consistency_tightness',
"""Class holding the DeepMAC network configutration.""" 'color_consistency_warmup_steps', 'color_consistency_warmup_start'
])
__slots__ = ()
def __new__(cls, classification_loss, dim, task_loss_weight, def _get_loss_weight(loss_name, config):
pixel_embedding_dim, allowed_masked_classes_ids, mask_size, if loss_name == DEEP_MASK_ESTIMATION:
mask_num_subsamples, use_xy, network_type, use_instance_embedding, return config.task_loss_weight
num_init_channels, predict_full_resolution_masks, elif loss_name == DEEP_MASK_COLOR_CONSISTENCY:
postprocess_crop_size, max_roi_jitter_ratio,
roi_jitter_mode, box_consistency_loss_weight,
color_consistency_threshold, color_consistency_dilation,
color_consistency_loss_weight):
return super(DeepMACParams,
cls).__new__(cls, classification_loss, dim,
task_loss_weight, pixel_embedding_dim,
allowed_masked_classes_ids, mask_size,
mask_num_subsamples, use_xy, network_type,
use_instance_embedding, num_init_channels,
predict_full_resolution_masks,
postprocess_crop_size, max_roi_jitter_ratio,
roi_jitter_mode, box_consistency_loss_weight,
color_consistency_threshold,
color_consistency_dilation,
color_consistency_loss_weight)
def _get_weak_loss_weight(loss_name, config):
if loss_name == DEEP_MASK_COLOR_CONSISTENCY:
return config.color_consistency_loss_weight return config.color_consistency_loss_weight
elif loss_name == DEEP_MASK_BOX_CONSISTENCY: elif loss_name == DEEP_MASK_BOX_CONSISTENCY:
return config.box_consistency_loss_weight return config.box_consistency_loss_weight
...@@ -151,7 +131,7 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None): ...@@ -151,7 +131,7 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None):
raise ValueError('Mask size must be set.') raise ValueError('Mask size must be set.')
return FullyConnectedMaskHead(num_init_channels, mask_size) return FullyConnectedMaskHead(num_init_channels, mask_size)
elif name == 'embedding_projection': elif _is_mask_head_param_free(name):
return tf.keras.layers.Lambda(lambda x: x) return tf.keras.layers.Lambda(lambda x: x)
elif name.startswith('resnet'): elif name.startswith('resnet'):
...@@ -395,6 +375,94 @@ def dilated_cross_same_mask_label(instance_masks, dilation=2): ...@@ -395,6 +375,94 @@ def dilated_cross_same_mask_label(instance_masks, dilation=2):
return tf.transpose(same_mask_prob, (0, 3, 1, 2)) return tf.transpose(same_mask_prob, (0, 3, 1, 2))
def _per_pixel_single_conv(input_tensor, params, channels):
  """Applies one per-instance, per-pixel linear layer to the input.

  The leading `input_channels * channels` entries of each row of `params` are
  used as that instance's weight matrix; the remaining entries are its biases.
  Every pixel's channel vector is multiplied by the instance's weight matrix
  and the biases are added.

  Args:
    input_tensor: A [num_instances, height, width, input_channels] shaped
      float tensor. The channel dimension must be statically known.
    params: A [num_instances, num_params] shaped float tensor.
    channels: int, number of output channels of the convolution.

  Returns:
    output: A float tensor of shape [num_instances, height, width, channels].
  """
  in_channels = input_tensor.get_shape().as_list()[3]
  num_weights = in_channels * channels

  # Split the flat parameter vector into the kernel part and the bias part.
  flat_kernel = params[:, :num_weights]
  bias = params[:, num_weights:]

  batch = tf.shape(params)[0]
  kernel = tf.reshape(flat_kernel, (batch, in_channels, channels))

  # Insert singleton axes so the batched matmul broadcasts each instance's
  # kernel over every row of pixels.
  expanded_input = input_tensor[:, :, tf.newaxis, :]
  expanded_kernel = kernel[:, tf.newaxis, tf.newaxis, :, :]
  product = expanded_input @ expanded_kernel

  # Drop the singleton axis added above, then add the per-instance biases.
  return product[:, :, 0, :, :] + bias[:, tf.newaxis, tf.newaxis, :]
def per_pixel_conditional_conv(input_tensor, parameters, channels, depth):
  """Runs a stack of per-pixel convolutions whose weights come from parameters.

  Implements the conditional convolution idea of [1]: each instance supplies
  the weights and biases of its own small network, which is applied to that
  instance's feature map. Every layer except the last has `channels` outputs
  followed by a ReLU; the last layer has a single output and no activation.

  [1]: https://arxiv.org/abs/2003.05664

  Args:
    input_tensor: float tensor of shape [num_instances, height,
      width, input_channels]
    parameters: A [num_instances, num_params] float tensor. If num_params
      is incompatible with the given channels and depth, a ValueError will
      be raised.
    channels: int, the number of channels in the convolution.
    depth: int, the number of layers of convolutions to perform.

  Returns:
    output: A [num_instances, height, width, 1] tensor (the final layer has a
      single output channel) with the conditional conv applied according to
      each instance's parameters.

  Raises:
    ValueError: If depth is 1 and input_channels differs from channels, or if
      num_params does not match the count implied by channels and depth.
  """
  in_channels = input_tensor.get_shape().as_list()[3]
  num_params = parameters.get_shape().as_list()[1]

  # Count how many layers of each kind the requested depth implies.
  num_input_convs = 1 if depth > 1 else 0
  num_hidden_convs = max(depth - 2, 0)

  weight_count = (in_channels * channels * num_input_convs
                  + channels * channels * num_hidden_convs
                  + channels)  # final conv
  bias_count = channels * (depth - 1) + 1
  expected_total = weight_count + bias_count

  if depth == 1 and in_channels != channels:
    raise ValueError(
        'When depth=1, input_channels({}) should be equal to'.format(
            in_channels) + ' channels({})'.format(channels))

  if num_params != expected_total:
    raise ValueError('Expected {} parameters at depth {}, but got {}'.format(
        expected_total, depth, num_params))

  offset = 0
  net = input_tensor
  layer_in = in_channels

  for layer in range(depth):
    last = layer == depth - 1
    # The final layer collapses to a single output channel.
    layer_out = 1 if last else channels

    # Weights plus biases consumed by this layer.
    size = layer_out * layer_in + layer_out
    layer_params = parameters[:, offset:offset + size]
    offset += size

    net = _per_pixel_single_conv(net, layer_params, layer_out)
    if not last:
      net = tf.nn.relu(net)

    layer_in = layer_out

  return net
class ResNetMaskNetwork(tf.keras.layers.Layer): class ResNetMaskNetwork(tf.keras.layers.Layer):
"""A small wrapper around ResNet blocks to predict masks.""" """A small wrapper around ResNet blocks to predict masks."""
...@@ -560,6 +628,16 @@ class DenseResNet(tf.keras.layers.Layer): ...@@ -560,6 +628,16 @@ class DenseResNet(tf.keras.layers.Layer):
return self.out_conv(self.resnet(net)) return self.out_conv(self.resnet(net))
def _is_mask_head_param_free(name):
# Mask heads which don't have parameters of their own and instead rely
# on the instance embedding.
if name == 'embedding_projection' or name.startswith('cond_inst'):
return True
return False
class MaskHeadNetwork(tf.keras.layers.Layer): class MaskHeadNetwork(tf.keras.layers.Layer):
"""Mask head class for DeepMAC.""" """Mask head class for DeepMAC."""
...@@ -586,13 +664,14 @@ class MaskHeadNetwork(tf.keras.layers.Layer): ...@@ -586,13 +664,14 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
self._use_instance_embedding = use_instance_embedding self._use_instance_embedding = use_instance_embedding
self._network_type = network_type self._network_type = network_type
self._num_init_channels = num_init_channels
if (self._use_instance_embedding and if (self._use_instance_embedding and
(self._network_type == 'embedding_projection')): (_is_mask_head_param_free(network_type))):
raise ValueError(('Cannot feed instance embedding to mask head when ' raise ValueError(('Cannot feed instance embedding to mask head when '
'computing embedding projection.')) 'mask-head has no parameters.'))
if network_type == 'embedding_projection': if _is_mask_head_param_free(network_type):
self.project_out = tf.keras.layers.Lambda(lambda x: x) self.project_out = tf.keras.layers.Lambda(lambda x: x)
else: else:
self.project_out = tf.keras.layers.Conv2D( self.project_out = tf.keras.layers.Conv2D(
...@@ -632,6 +711,11 @@ class MaskHeadNetwork(tf.keras.layers.Layer): ...@@ -632,6 +711,11 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :] instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
out = embedding_projection(instance_embedding, out) out = embedding_projection(instance_embedding, out)
elif self._network_type.startswith('cond_inst'):
depth = int(self._network_type.lstrip('cond_inst'))
out = per_pixel_conditional_conv(out, instance_embedding,
self._num_init_channels, depth)
if out.shape[-1] > 1: if out.shape[-1] > 1:
out = self.project_out(out) out = self.project_out(out)
...@@ -651,6 +735,9 @@ def deepmac_proto_to_params(deepmac_config): ...@@ -651,6 +735,9 @@ def deepmac_proto_to_params(deepmac_config):
jitter_mode = preprocessor_pb2.RandomJitterBoxes.JitterMode.Name( jitter_mode = preprocessor_pb2.RandomJitterBoxes.JitterMode.Name(
deepmac_config.jitter_mode).lower() deepmac_config.jitter_mode).lower()
box_consistency_loss_normalize = center_net_pb2.LossNormalize.Name(
deepmac_config.box_consistency_loss_normalize).lower()
return DeepMACParams( return DeepMACParams(
dim=deepmac_config.dim, dim=deepmac_config.dim,
classification_loss=classification_loss, classification_loss=classification_loss,
...@@ -671,7 +758,14 @@ def deepmac_proto_to_params(deepmac_config): ...@@ -671,7 +758,14 @@ def deepmac_proto_to_params(deepmac_config):
box_consistency_loss_weight=deepmac_config.box_consistency_loss_weight, box_consistency_loss_weight=deepmac_config.box_consistency_loss_weight,
color_consistency_threshold=deepmac_config.color_consistency_threshold, color_consistency_threshold=deepmac_config.color_consistency_threshold,
color_consistency_dilation=deepmac_config.color_consistency_dilation, color_consistency_dilation=deepmac_config.color_consistency_dilation,
color_consistency_loss_weight=deepmac_config.color_consistency_loss_weight color_consistency_loss_weight=
deepmac_config.color_consistency_loss_weight,
box_consistency_loss_normalize=box_consistency_loss_normalize,
box_consistency_tightness=deepmac_config.box_consistency_tightness,
color_consistency_warmup_steps=
deepmac_config.color_consistency_warmup_steps,
color_consistency_warmup_start=
deepmac_config.color_consistency_warmup_start
) )
...@@ -868,6 +962,60 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch): ...@@ -868,6 +962,60 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
return resize_instance_masks(logits, (height, width)) return resize_instance_masks(logits, (height, width))
def _aggregate_classification_loss(self, loss, gt, pred, method):
"""Aggregates loss at a per-instance level.
When this function is used with mask-heads, num_classes is usually 1.
Args:
loss: A [num_instances, num_pixels, num_classes] or
[num_instances, num_classes] tensor. If the tensor is of rank 2, i.e.,
of the form [num_instances, num_classes], we will assume that the
number of pixels have already been nornalized.
gt: A [num_instances, num_pixels, num_classes] float tensor of
groundtruths.
pred: A [num_instances, num_pixels, num_classes] float tensor of
preditions.
method: A string in ['auto', 'groundtruth'].
'auto': When `loss` is rank 2, aggregates by sum. Otherwise, aggregates
by mean.
'groundtruth_count': Aggreagates the loss by computing sum and dividing
by the number of positive (1) groundtruth pixels.
'balanced': Normalizes each pixel by the number of positive or negative
pixels depending on the groundtruth.
Returns:
per_instance_loss: A [num_instances] float tensor.
"""
rank = len(loss.get_shape().as_list())
if rank == 2:
axes = [1]
else:
axes = [1, 2]
if method == 'normalize_auto':
normalization = 1.0
if rank == 2:
return tf.reduce_sum(loss, axis=axes)
else:
return tf.reduce_mean(loss, axis=axes)
elif method == 'normalize_groundtruth_count':
normalization = tf.reduce_sum(gt, axis=axes)
return tf.reduce_sum(loss, axis=axes) / normalization
elif method == 'normalize_balanced':
if rank != 3:
raise ValueError('Cannot apply normalized_balanced aggregation '
f'to loss of rank {rank}')
normalization = (
(gt * tf.reduce_sum(gt, keepdims=True, axis=axes)) +
(1 - gt) * tf.reduce_sum(1 - gt, keepdims=True, axis=axes))
return tf.reduce_sum(loss / normalization, axis=axes)
else:
raise ValueError('Unknown loss aggregation - {}'.format(method))
def _compute_per_instance_mask_prediction_loss( def _compute_per_instance_mask_prediction_loss(
self, boxes, mask_logits, mask_gt): self, boxes, mask_logits, mask_gt):
"""Compute the per-instance mask loss. """Compute the per-instance mask loss.
...@@ -891,14 +1039,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch): ...@@ -891,14 +1039,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
target_tensor=mask_gt, target_tensor=mask_gt,
weights=tf.ones_like(mask_logits)) weights=tf.ones_like(mask_logits))
# TODO(vighneshb) Make this configurable via config. return self._aggregate_classification_loss(
# Skip normalization for dice loss because the denominator term already loss, mask_gt, mask_logits, 'normalize_auto')
# does normalization.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
def _compute_per_instance_box_consistency_loss( def _compute_per_instance_box_consistency_loss(
self, boxes_gt, boxes_for_crop, mask_logits): self, boxes_gt, boxes_for_crop, mask_logits):
...@@ -930,23 +1072,30 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch): ...@@ -930,23 +1072,30 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
loss = 0.0 loss = 0.0
for axis in [1, 2]: for axis in [1, 2]:
pred_max = tf.reduce_max(pred_crop, axis=axis)[:, :, tf.newaxis]
if self._deepmac_params.box_consistency_tightness:
pred_max_raw = tf.reduce_max(pred_crop, axis=axis)
pred_max_within_box = tf.reduce_max(pred_crop * gt_crop, axis=axis)
box_1d = tf.reduce_max(gt_crop, axis=axis)
pred_max = ((box_1d * pred_max_within_box) +
((1 - box_1d) * pred_max_raw))
else:
pred_max = tf.reduce_max(pred_crop, axis=axis)
pred_max = pred_max[:, :, tf.newaxis]
gt_max = tf.reduce_max(gt_crop, axis=axis)[:, :, tf.newaxis] gt_max = tf.reduce_max(gt_crop, axis=axis)[:, :, tf.newaxis]
axis_loss = self._deepmac_params.classification_loss( raw_loss = self._deepmac_params.classification_loss(
prediction_tensor=pred_max, prediction_tensor=pred_max,
target_tensor=gt_max, target_tensor=gt_max,
weights=tf.ones_like(pred_max)) weights=tf.ones_like(pred_max))
loss += axis_loss
loss += self._aggregate_classification_loss(
# Skip normalization for dice loss because the denominator term already raw_loss, gt_max, pred_max,
# does normalization. self._deepmac_params.box_consistency_loss_normalize)
# TODO(vighneshb) Make this configurable via config.
if isinstance(self._deepmac_params.classification_loss, return loss
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
def _compute_per_instance_color_consistency_loss( def _compute_per_instance_color_consistency_loss(
self, boxes, preprocessed_image, mask_logits): self, boxes, preprocessed_image, mask_logits):
...@@ -995,6 +1144,17 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch): ...@@ -995,6 +1144,17 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
num_box_pixels = tf.maximum(1.0, tf.reduce_sum(box_mask, axis=[1, 2])) num_box_pixels = tf.maximum(1.0, tf.reduce_sum(box_mask, axis=[1, 2]))
loss = loss / num_box_pixels loss = loss / num_box_pixels
if ((self._deepmac_params.color_consistency_warmup_steps > 0) and
self._is_training):
training_step = tf.cast(self.training_step, tf.float32)
warmup_steps = tf.cast(
self._deepmac_params.color_consistency_warmup_steps, tf.float32)
start_step = tf.cast(
self._deepmac_params.color_consistency_warmup_start, tf.float32)
warmup_weight = (training_step - start_step) / warmup_steps
warmup_weight = tf.clip_by_value(warmup_weight, 0.0, 1.0)
loss *= warmup_weight
return loss return loss
def _compute_per_instance_deepmac_losses( def _compute_per_instance_deepmac_losses(
...@@ -1084,11 +1244,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch): ...@@ -1084,11 +1244,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
allowed_masked_classes_ids = ( allowed_masked_classes_ids = (
self._deepmac_params.allowed_masked_classes_ids) self._deepmac_params.allowed_masked_classes_ids)
loss_dict = { loss_dict = {}
DEEP_MASK_ESTIMATION: 0.0, for loss_name in MASK_LOSSES:
}
for loss_name in WEAK_LOSSES:
loss_dict[loss_name] = 0.0 loss_dict[loss_name] = 0.0
prediction_shape = tf.shape(prediction_dict[INSTANCE_EMBEDDING][0]) prediction_shape = tf.shape(prediction_dict[INSTANCE_EMBEDDING][0])
...@@ -1148,13 +1305,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch): ...@@ -1148,13 +1305,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
mask_loss_dict = self._compute_instance_masks_loss( mask_loss_dict = self._compute_instance_masks_loss(
prediction_dict=prediction_dict) prediction_dict=prediction_dict)
losses_dict[LOSS_KEY_PREFIX + '/' + DEEP_MASK_ESTIMATION] = ( for loss_name in MASK_LOSSES:
self._deepmac_params.task_loss_weight * mask_loss_dict[ loss_weight = _get_loss_weight(loss_name, self._deepmac_params)
DEEP_MASK_ESTIMATION]
)
for loss_name in WEAK_LOSSES:
loss_weight = _get_weak_loss_weight(loss_name, self._deepmac_params)
if loss_weight > 0.0: if loss_weight > 0.0:
losses_dict[LOSS_KEY_PREFIX + '/' + loss_name] = ( losses_dict[LOSS_KEY_PREFIX + '/' + loss_name] = (
loss_weight * mask_loss_dict[loss_name]) loss_weight * mask_loss_dict[loss_name])
......
"""Tests for google3.third_party.tensorflow_models.object_detection.meta_architectures.deepmac_meta_arch.""" """Tests for google3.third_party.tensorflow_models.object_detection.meta_architectures.deepmac_meta_arch."""
import functools import functools
import random
import unittest import unittest
from absl.testing import parameterized from absl.testing import parameterized
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
from google.protobuf import text_format
from object_detection.core import losses from object_detection.core import losses
from object_detection.core import preprocessor from object_detection.core import preprocessor
from object_detection.meta_architectures import center_net_meta_arch from object_detection.meta_architectures import center_net_meta_arch
from object_detection.meta_architectures import deepmac_meta_arch from object_detection.meta_architectures import deepmac_meta_arch
from object_detection.protos import center_net_pb2
from object_detection.utils import tf_version from object_detection.utils import tf_version
# Text-format proto fixture. `test_proto_parse` parses it into a
# `CenterNet.DeepMACMaskEstimation` message and converts the result with
# `deepmac_meta_arch.deepmac_proto_to_params`.
DEEPMAC_PROTO_TEXT = """
dim: 153
task_loss_weight: 5.0
pixel_embedding_dim: 8
use_xy: false
use_instance_embedding: false
network_type: "cond_inst3"
num_init_channels: 8
classification_loss {
weighted_dice_classification_loss {
squared_normalization: false
is_prediction_probability: false
}
}
jitter_mode: EXPAND_SYMMETRIC_XY
max_roi_jitter_ratio: 0.0
predict_full_resolution_masks: true
allowed_masked_classes_ids: [99]
box_consistency_loss_weight: 1.0
color_consistency_loss_weight: 1.0
color_consistency_threshold: 0.1
box_consistency_tightness: false
box_consistency_loss_normalize: NORMALIZE_AUTO
color_consistency_warmup_steps: 20
color_consistency_warmup_start: 10
"""
class DummyFeatureExtractor(center_net_meta_arch.CenterNetFeatureExtractor): class DummyFeatureExtractor(center_net_meta_arch.CenterNetFeatureExtractor):
def __init__(self, def __init__(self,
...@@ -60,14 +93,37 @@ class MockMaskNet(tf.keras.layers.Layer): ...@@ -60,14 +93,37 @@ class MockMaskNet(tf.keras.layers.Layer):
return tf.zeros_like(pixel_embedding[:, :, :, 0]) + 0.9 return tf.zeros_like(pixel_embedding[:, :, :, 0]) + 0.9
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, def build_meta_arch(**override_params):
use_instance_embedding=True, mask_num_subsamples=-1,
network_type='hourglass10', use_xy=True,
pixel_embedding_dim=2,
dice_loss_prediction_probability=False,
color_consistency_threshold=0.5):
"""Builds the DeepMAC meta architecture.""" """Builds the DeepMAC meta architecture."""
params = dict(
predict_full_resolution_masks=False,
use_instance_embedding=True,
mask_num_subsamples=-1,
network_type='hourglass10',
use_xy=True,
pixel_embedding_dim=2,
dice_loss_prediction_probability=False,
color_consistency_threshold=0.5,
use_dice_loss=False,
box_consistency_loss_normalize='normalize_auto',
box_consistency_tightness=False,
task_loss_weight=1.0,
color_consistency_loss_weight=1.0,
box_consistency_loss_weight=1.0,
num_init_channels=8,
dim=8,
allowed_masked_classes_ids=[],
mask_size=16,
postprocess_crop_size=128,
max_roi_jitter_ratio=0.0,
roi_jitter_mode='random',
color_consistency_dilation=2,
color_consistency_warmup_steps=0,
color_consistency_warmup_start=0)
params.update(override_params)
feature_extractor = DummyFeatureExtractor( feature_extractor = DummyFeatureExtractor(
channel_means=(1.0, 2.0, 3.0), channel_means=(1.0, 2.0, 3.0),
channel_stds=(10., 20., 30.), channel_stds=(10., 20., 30.),
...@@ -87,33 +143,18 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, ...@@ -87,33 +143,18 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
max_box_predictions=5, max_box_predictions=5,
use_labeled_classes=False) use_labeled_classes=False)
use_dice_loss = params.pop('use_dice_loss')
dice_loss_prediction_prob = params.pop('dice_loss_prediction_probability')
if use_dice_loss: if use_dice_loss:
classification_loss = losses.WeightedDiceClassificationLoss( classification_loss = losses.WeightedDiceClassificationLoss(
squared_normalization=False, squared_normalization=False,
is_prediction_probability=dice_loss_prediction_probability) is_prediction_probability=dice_loss_prediction_prob)
else: else:
classification_loss = losses.WeightedSigmoidClassificationLoss() classification_loss = losses.WeightedSigmoidClassificationLoss()
deepmac_params = deepmac_meta_arch.DeepMACParams( deepmac_params = deepmac_meta_arch.DeepMACParams(
classification_loss=classification_loss, classification_loss=classification_loss,
dim=8, **params
task_loss_weight=1.0,
pixel_embedding_dim=pixel_embedding_dim,
allowed_masked_classes_ids=[],
mask_size=16,
mask_num_subsamples=mask_num_subsamples,
use_xy=use_xy,
network_type=network_type,
use_instance_embedding=use_instance_embedding,
num_init_channels=8,
predict_full_resolution_masks=predict_full_resolution_masks,
postprocess_crop_size=128,
max_roi_jitter_ratio=0.0,
roi_jitter_mode='random',
box_consistency_loss_weight=1.0,
color_consistency_threshold=color_consistency_threshold,
color_consistency_dilation=2,
color_consistency_loss_weight=1.0
) )
object_detection_params = center_net_meta_arch.ObjectDetectionParams( object_detection_params = center_net_meta_arch.ObjectDetectionParams(
...@@ -136,6 +177,15 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False, ...@@ -136,6 +177,15 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.') @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase): class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
def test_proto_parse(self):
  """Parses DEEPMAC_PROTO_TEXT and checks the converted DeepMACParams."""
  # text_format.Parse fills in and returns the message it is given.
  parsed_proto = text_format.Parse(
      DEEPMAC_PROTO_TEXT, center_net_pb2.CenterNet().DeepMACMaskEstimation())
  deepmac_params = deepmac_meta_arch.deepmac_proto_to_params(parsed_proto)

  self.assertIsInstance(deepmac_params, deepmac_meta_arch.DeepMACParams)
  self.assertEqual(deepmac_params.dim, 153)
  self.assertEqual(
      deepmac_params.box_consistency_loss_normalize, 'normalize_auto')
def test_subsample_trivial(self): def test_subsample_trivial(self):
"""Test subsampling masks.""" """Test subsampling masks."""
...@@ -280,18 +330,126 @@ class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase): ...@@ -280,18 +330,126 @@ class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
self.assertAllClose(np.ones((8, 5, 5)), output[:, 1, :, :]) self.assertAllClose(np.ones((8, 5, 5)), output[:, 1, :, :])
self.assertAllClose([1, 0, 0, 0, 0, 0, 0, 1], output[:, 0, 2, 2]) self.assertAllClose([1, 0, 0, 0, 0, 0, 0, 1], output[:, 0, 2, 2])
def test_per_pixel_single_conv_multiple_instance(self):
  """Checks the output shape of `_per_pixel_single_conv` for 5 instances."""
  num_instances, in_channels, out_channels = 5, 7, 8
  features = tf.zeros((num_instances, 32, 32, in_channels))
  # 7*8 + 8 values per instance -- presumably an in*out weight matrix plus
  # one bias per output channel; TODO confirm against the implementation.
  conv_params = tf.zeros(
      (num_instances, in_channels * out_channels + out_channels))

  result = deepmac_meta_arch._per_pixel_single_conv(
      features, conv_params, out_channels)

  self.assertEqual(result.shape, (num_instances, 32, 32, out_channels))
def test_per_pixel_conditional_conv_error(self):
  """A 2-dim instance embedding with channels=8, depth=3 raises ValueError."""
  pixel_features = tf.zeros((10, 32, 32, 8))
  instance_params = tf.zeros((10, 2))
  with self.assertRaises(ValueError):
    deepmac_meta_arch.per_pixel_conditional_conv(
        pixel_features, instance_params, channels=8, depth=3)
def test_per_pixel_conditional_conv_error_tf_func(self):
  """Same failing inputs as the eager error test, run through tf.function."""
  pixel_features = tf.zeros((10, 32, 32, 8))
  instance_params = tf.zeros((10, 2))
  with self.assertRaises(ValueError):
    traced_conv = tf.function(deepmac_meta_arch.per_pixel_conditional_conv)
    traced_conv(pixel_features, instance_params, 8, 3)
def test_per_pixel_conditional_conv_depth1_error(self):
  """With depth=1, 7 input channels and channels=99 raises ValueError."""
  pixel_features = tf.zeros((10, 32, 32, 7))
  instance_params = tf.zeros((10, 8))
  with self.assertRaises(ValueError):
    _ = deepmac_meta_arch.per_pixel_conditional_conv(
        pixel_features, instance_params, channels=99, depth=1)
@parameterized.parameters([
    dict(num_input_channels=7, instance_embedding_dim=8,
         channels=7, depth=1),
    dict(num_input_channels=7, instance_embedding_dim=82,
         channels=9, depth=2),
    # From https://arxiv.org/abs/2003.05664
    dict(num_input_channels=10, instance_embedding_dim=169,
         channels=8, depth=3),
    dict(num_input_channels=8, instance_embedding_dim=433,
         channels=16, depth=3),
    dict(num_input_channels=8, instance_embedding_dim=1377,
         channels=32, depth=3),
    dict(num_input_channels=8, instance_embedding_dim=4801,
         channels=64, depth=3),
])
def test_per_pixel_conditional_conv_shape(
    self, num_input_channels, instance_embedding_dim, channels, depth):
  """Checks that every configuration yields a (10, 32, 32, 1) output."""
  pixel_features = tf.zeros((10, 32, 32, num_input_channels))
  instance_params = tf.zeros((10, instance_embedding_dim))

  result = deepmac_meta_arch.per_pixel_conditional_conv(
      pixel_features, instance_params, channels, depth)

  self.assertEqual(result.shape, (10, 32, 32, 1))
def test_per_pixel_conditional_conv_value_depth1(self):
  """Checks the numeric output of a depth-1 conditional conv on one pixel."""
  pixel = tf.reshape(tf.constant(np.array([1, 2, 3])), (1, 1, 1, 3))
  embedding = tf.reshape(
      tf.constant(np.array([1, 10, 100, 1000])), (1, 4))

  actual = deepmac_meta_arch.per_pixel_conditional_conv(
      pixel, embedding, channels=3, depth=1)

  # 1321 == 1*1 + 2*10 + 3*100 + 1000, which is consistent with the first
  # three embedding entries acting as weights and the last one as a bias
  # (presumed -- verify against per_pixel_conditional_conv).
  expected = np.reshape(np.array([1321]), (1, 1, 1, 1))
  self.assertAllClose(expected, actual)
def test_per_pixel_conditional_conv_value_depth2_single(self):
  """Checks the numeric output of a depth-2, single-channel conv."""
  pixel = tf.reshape(tf.constant(np.array([2])), (1, 1, 1, 1))
  embedding = tf.reshape(
      tf.constant(np.array([-2, 3, 100, 5])), (1, 4))

  actual = deepmac_meta_arch.per_pixel_conditional_conv(
      pixel, embedding, channels=1, depth=2)

  expected = np.reshape(np.array([5]), (1, 1, 1, 1))
  self.assertAllClose(expected, actual)
def test_per_pixel_conditional_conv_value_depth2_identity(self):
  """Checks the numeric output of a depth-2, two-channel conv."""
  pixel = tf.reshape(tf.constant(np.array([1, 2])), (1, 1, 1, 2))
  embedding = tf.reshape(
      tf.constant(np.array([1, 0, 0, 1, 1, -3, 5, 100, -9])), (1, 9))

  actual = deepmac_meta_arch.per_pixel_conditional_conv(
      pixel, embedding, channels=2, depth=2)

  expected = np.reshape(np.array([1]), (1, 1, 1, 1))
  self.assertAllClose(expected, actual)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.') @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase): class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
['hourglass10', 'hourglass20', 'resnet4'])
def test_mask_network(self, head_type):
net = deepmac_meta_arch.MaskHeadNetwork(head_type, 8)
out = net(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_params_resnet4(self): def test_mask_network_params_resnet4(self):
net = deepmac_meta_arch.MaskHeadNetwork('resnet4', num_init_channels=8) net = deepmac_meta_arch.MaskHeadNetwork('resnet4', num_init_channels=8)
_ = net(tf.zeros((2, 16)), tf.zeros((2, 32, 32, 16)), training=True) _ = net(tf.zeros((2, 16)), tf.zeros((2, 32, 32, 16)), training=True)
...@@ -301,39 +459,93 @@ class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase): ...@@ -301,39 +459,93 @@ class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
self.assertEqual(trainable_params.numpy(), 8665) self.assertEqual(trainable_params.numpy(), 8665)
def test_mask_network_resnet_tf_function(self): def test_mask_network_embedding_projection_small(self):
net = deepmac_meta_arch.MaskHeadNetwork('resnet8')
call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_embedding_projection_zero(self):
net = deepmac_meta_arch.MaskHeadNetwork( net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_projection', num_init_channels=8, 'embedding_projection', num_init_channels=-1,
use_instance_embedding=False) use_instance_embedding=False)
call_func = tf.function(net.__call__) call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 7)), tf.zeros((2, 32, 32, 7)), training=True) out = call_func(1e6 + tf.zeros((2, 7)),
tf.zeros((2, 32, 32, 7)), training=True)
self.assertEqual(out.shape, (2, 32, 32)) self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf) self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf) self.assertAllLess(out.numpy(), np.inf)
def test_mask_network_embedding_projection_small(self): @parameterized.parameters([
{
'mask_net': 'resnet4',
'mask_net_channels': 8,
'instance_embedding_dim': 4,
'input_channels': 16,
'use_instance_embedding': False
},
{
'mask_net': 'hourglass10',
'mask_net_channels': 8,
'instance_embedding_dim': 4,
'input_channels': 16,
'use_instance_embedding': False
},
{
'mask_net': 'hourglass20',
'mask_net_channels': 8,
'instance_embedding_dim': 4,
'input_channels': 16,
'use_instance_embedding': False
},
{
'mask_net': 'cond_inst3',
'mask_net_channels': 8,
'instance_embedding_dim': 153,
'input_channels': 8,
'use_instance_embedding': False
},
{
'mask_net': 'cond_inst3',
'mask_net_channels': 8,
'instance_embedding_dim': 169,
'input_channels': 10,
'use_instance_embedding': False
},
{
'mask_net': 'cond_inst1',
'mask_net_channels': 8,
'instance_embedding_dim': 9,
'input_channels': 8,
'use_instance_embedding': False
},
{
'mask_net': 'cond_inst2',
'mask_net_channels': 8,
'instance_embedding_dim': 81,
'input_channels': 8,
'use_instance_embedding': False
},
])
def test_mask_network(self, mask_net, mask_net_channels,
instance_embedding_dim, input_channels,
use_instance_embedding):
net = deepmac_meta_arch.MaskHeadNetwork( net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_projection', num_init_channels=-1, mask_net, num_init_channels=mask_net_channels,
use_instance_embedding=False) use_instance_embedding=use_instance_embedding)
call_func = tf.function(net.__call__) call_func = tf.function(net.__call__)
out = call_func(1e6 + tf.zeros((2, 7)), out = call_func(tf.zeros((2, instance_embedding_dim)),
tf.zeros((2, 32, 32, 7)), training=True) tf.zeros((2, 32, 32, input_channels)), training=True)
self.assertEqual(out.shape, (2, 32, 32)) self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf) self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf) self.assertAllLess(out.numpy(), np.inf)
out = call_func(tf.zeros((2, instance_embedding_dim)),
tf.zeros((2, 32, 32, input_channels)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
out = call_func(tf.zeros((0, instance_embedding_dim)),
tf.zeros((0, 32, 32, input_channels)), training=True)
self.assertEqual(out.shape, (0, 32, 32))
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.') @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase): class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
...@@ -619,8 +831,85 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase): ...@@ -619,8 +831,85 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
xloss = tf.nn.sigmoid_cross_entropy_with_logits( xloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 16), labels=tf.constant([1.0] * 16),
logits=[1.0] * 12 + [0.0] * 4) logits=[1.0] * 12 + [0.0] * 4)
yloss_mean = tf.reduce_mean(yloss)
xloss_mean = tf.reduce_mean(xloss)
self.assertAllClose(loss, [yloss_mean + xloss_mean])
def test_box_consistency_loss_with_tightness(self):
boxes_gt = tf.constant([[0., 0., 0.49, 0.49]])
boxes_jittered = None
mask_prediction = np.zeros((1, 8, 8)).astype(np.float32) - 1e10
mask_prediction[0, :4, :4] = 1e10
self.assertAllClose(loss, [tf.reduce_mean(yloss + xloss).numpy()]) model = build_meta_arch(box_consistency_tightness=True,
predict_full_resolution_masks=True)
loss = model._compute_per_instance_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
self.assertAllClose(loss, [0.0])
def test_box_consistency_loss_gt_count(self):
  """Box consistency loss under 'normalize_groundtruth_count'.

  Two instances are checked against sigmoid cross-entropy values that are
  summed per axis and divided by the number of positive labels (32 and 16).
  """
  gt_boxes = tf.constant([
      [0., 0., 1.0, 1.0],
      [0., 0., 0.49, 0.49]])
  mask_logits = np.zeros((2, 32, 32)).astype(np.float32)
  mask_logits[0, :16, :16] = 1.0
  mask_logits[1, :8, :8] = 1.0

  model = build_meta_arch(
      predict_full_resolution_masks=True,
      box_consistency_loss_normalize='normalize_groundtruth_count')
  compute_loss = tf.function(
      model._compute_per_instance_box_consistency_loss)
  # The second argument (jittered boxes) is None, as in the other
  # box-consistency tests.
  loss = compute_loss(gt_boxes, None, tf.constant(mask_logits))

  def expected_total(labels, logits_1d, num_positive):
    # The expected value is identical along both image axes, so the
    # per-axis sum is simply added to itself.
    per_pixel = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.constant(labels), logits=logits_1d) / num_positive
    per_axis = tf.reduce_sum(per_pixel)
    return per_axis + per_axis

  self.assertAllClose(
      loss[0],
      expected_total([1.0] * 32, [1.0] * 16 + [0.0] * 16, 32.0))
  self.assertAllClose(
      loss[1],
      expected_total(
          [1.0] * 16 + [0.0] * 16, [1.0] * 8 + [0.0] * 24, 16.0))
def test_box_consistency_loss_balanced(self):
  """Box consistency loss under 'normalize_balanced' for one instance.

  The mask logits are 1.0 everywhere; the expected per-axis value is the
  summed sigmoid cross-entropy over 16 negative and 16 positive labels
  divided by 16.
  """
  gt_boxes = tf.constant([
      [0., 0., 0.49, 0.49]])
  mask_logits = np.zeros((1, 32, 32)).astype(np.float32)
  mask_logits[0] = 1.0

  model = build_meta_arch(
      predict_full_resolution_masks=True,
      box_consistency_loss_normalize='normalize_balanced')
  compute_loss = tf.function(
      model._compute_per_instance_box_consistency_loss)
  loss = compute_loss(gt_boxes, None, tf.constant(mask_logits))

  per_pixel = tf.nn.sigmoid_cross_entropy_with_logits(
      labels=[0.] * 16 + [1.0] * 16,
      logits=[1.0] * 32)
  per_axis = tf.reduce_sum(per_pixel) / 16.0

  # Both image axes contribute the same expected value.
  self.assertAllClose(loss[0], per_axis + per_axis)
def test_box_consistency_dice_loss(self): def test_box_consistency_dice_loss(self):
...@@ -701,34 +990,145 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase): ...@@ -701,34 +990,145 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
loss = model.loss(prediction, tf.constant([[32, 32, 3.0]])) loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0) self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
for weak_loss in deepmac_meta_arch.WEAK_LOSSES: for weak_loss in deepmac_meta_arch.MASK_LOSSES:
if weak_loss == deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY: if weak_loss == deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY:
continue continue
self.assertGreater(loss['Loss/' + weak_loss], 0.0, self.assertGreater(loss['Loss/' + weak_loss], 0.0,
'{} was <= 0'.format(weak_loss)) '{} was <= 0'.format(weak_loss))
def test_loss_keys_full_res(self): def test_loss_weight_response(self):
model = build_meta_arch(use_dice_loss=True, model = build_meta_arch(
predict_full_resolution_masks=True) use_dice_loss=True,
predict_full_resolution_masks=True,
network_type='cond_inst1',
dim=9,
pixel_embedding_dim=8,
use_instance_embedding=False,
use_xy=False)
num_stages = 1
prediction = { prediction = {
'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)), 'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 17))] * 2, 'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 9))] * num_stages,
'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 19))] * 2, 'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 8))] * num_stages,
'object_center': [tf.random.normal((1, 8, 8, 6))] * 2, 'object_center': [tf.random.normal((1, 8, 8, 6))] * num_stages,
'box/offset': [tf.random.normal((1, 8, 8, 2))] * 2, 'box/offset': [tf.random.normal((1, 8, 8, 2))] * num_stages,
'box/scale': [tf.random.normal((1, 8, 8, 2))] * 2 'box/scale': [tf.random.normal((1, 8, 8, 2))] * num_stages
} }
boxes = [tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)]
classes = [tf.one_hot([1, 0, 1, 1, 1], depth=6)]
weights = [tf.ones(5)]
masks = [tf.ones((5, 32, 32))]
model.provide_groundtruth( model.provide_groundtruth(
groundtruth_boxes_list=[tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)], groundtruth_boxes_list=boxes,
groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)], groundtruth_classes_list=classes,
groundtruth_weights_list=[tf.ones(5)], groundtruth_weights_list=weights,
groundtruth_masks_list=[tf.ones((5, 32, 32))]) groundtruth_masks_list=masks)
loss = model.loss(prediction, tf.constant([[32, 32, 3.0]])) loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0) self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
for weak_loss in deepmac_meta_arch.WEAK_LOSSES: for mask_loss in deepmac_meta_arch.MASK_LOSSES:
self.assertGreater(loss['Loss/' + weak_loss], 0.0, self.assertGreater(loss['Loss/' + mask_loss], 0.0,
'{} was <= 0'.format(weak_loss)) '{} was <= 0'.format(mask_loss))
rng = random.Random(0)
loss_weights = {
deepmac_meta_arch.DEEP_MASK_ESTIMATION: rng.uniform(1, 5),
deepmac_meta_arch.DEEP_MASK_BOX_CONSISTENCY: rng.uniform(1, 5),
deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY: rng.uniform(1, 5)
}
weighted_model = build_meta_arch(
use_dice_loss=True,
predict_full_resolution_masks=True,
network_type='cond_inst1',
dim=9,
pixel_embedding_dim=8,
use_instance_embedding=False,
use_xy=False,
task_loss_weight=loss_weights[deepmac_meta_arch.DEEP_MASK_ESTIMATION],
box_consistency_loss_weight=(
loss_weights[deepmac_meta_arch.DEEP_MASK_BOX_CONSISTENCY]),
color_consistency_loss_weight=(
loss_weights[deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY]))
weighted_model.provide_groundtruth(
groundtruth_boxes_list=boxes,
groundtruth_classes_list=classes,
groundtruth_weights_list=weights,
groundtruth_masks_list=masks)
weighted_loss = weighted_model.loss(prediction, tf.constant([[32, 32, 3]]))
for mask_loss in deepmac_meta_arch.MASK_LOSSES:
loss_key = 'Loss/' + mask_loss
self.assertAllEqual(
weighted_loss[loss_key], loss[loss_key] * loss_weights[mask_loss],
f'{mask_loss} did not respond to change in weight.')
def test_color_consistency_warmup(self):
  """Color consistency loss follows the configured warmup schedule.

  With warmup_start=10 and warmup_steps=10 the test expects the loss to be
  0 at step 5, half of the step-20 value at step 15, and unchanged between
  step 20 and step 100.
  """
  model = build_meta_arch(
      use_dice_loss=True,
      predict_full_resolution_masks=True,
      network_type='cond_inst1',
      dim=9,
      pixel_embedding_dim=8,
      use_instance_embedding=False,
      use_xy=False,
      color_consistency_warmup_steps=10,
      color_consistency_warmup_start=10)

  num_stages = 1
  prediction = {
      'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
      'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 9))] * num_stages,
      'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 8))] * num_stages,
      'object_center': [tf.random.normal((1, 8, 8, 6))] * num_stages,
      'box/offset': [tf.random.normal((1, 8, 8, 2))] * num_stages,
      'box/scale': [tf.random.normal((1, 8, 8, 2))] * num_stages
  }

  # The same groundtruth is provided before every loss evaluation; only
  # `training_step` changes between evaluations.
  groundtruth = dict(
      groundtruth_boxes_list=[tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)],
      groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)],
      groundtruth_weights_list=[tf.ones(5)],
      groundtruth_masks_list=[tf.ones((5, 32, 32))])

  loss_key = 'Loss/' + deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY
  color_loss = {}
  for step in (5, 15, 20, 100):
    model.provide_groundtruth(training_step=step, **groundtruth)
    step_losses = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
    color_loss[step] = step_losses[loss_key].numpy()

  self.assertAlmostEqual(color_loss[5], 0.0)
  self.assertAlmostEqual(color_loss[15], color_loss[20] / 2.0)
  self.assertAlmostEqual(color_loss[20], color_loss[100])
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.') @unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
......
...@@ -114,6 +114,10 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic, ...@@ -114,6 +114,10 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
'groundtruth_not_exhaustive_classes': [batch_size, num_classes] K-hot 'groundtruth_not_exhaustive_classes': [batch_size, num_classes] K-hot
representation of 1-indexed classes which don't have all of their representation of 1-indexed classes which don't have all of their
instances marked exhaustively. instances marked exhaustively.
'input_data_fields.groundtruth_image_classes': integer representation of
the classes that were sent for verification for a given image. Note that
this field does not support batching as the number of classes can be
variable.
class_agnostic: Boolean indicating whether detections are class agnostic. class_agnostic: Boolean indicating whether detections are class agnostic.
""" """
input_data_fields = fields.InputDataFields() input_data_fields = fields.InputDataFields()
...@@ -136,6 +140,18 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic, ...@@ -136,6 +140,18 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
input_data_fields.groundtruth_classes: groundtruth_classes input_data_fields.groundtruth_classes: groundtruth_classes
} }
if detection_model.groundtruth_has_field(
input_data_fields.groundtruth_image_classes):
groundtruth_image_classes_k_hot = tf.stack(
detection_model.groundtruth_lists(
input_data_fields.groundtruth_image_classes))
# We do not add label_id_offset here because it was not added when encoding
# groundtruth_image_classes.
groundtruth_image_classes = tf.expand_dims(
tf.where(groundtruth_image_classes_k_hot > 0)[:, 1], 0)
groundtruth[
input_data_fields.groundtruth_image_classes] = groundtruth_image_classes
if detection_model.groundtruth_has_field(fields.BoxListFields.masks): if detection_model.groundtruth_has_field(fields.BoxListFields.masks):
groundtruth[input_data_fields.groundtruth_instance_masks] = tf.stack( groundtruth[input_data_fields.groundtruth_instance_masks] = tf.stack(
detection_model.groundtruth_lists(fields.BoxListFields.masks)) detection_model.groundtruth_lists(fields.BoxListFields.masks))
...@@ -303,7 +319,7 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True): ...@@ -303,7 +319,7 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
return unbatched_tensor_dict return unbatched_tensor_dict
def provide_groundtruth(model, labels): def provide_groundtruth(model, labels, training_step=None):
"""Provides the labels to a model as groundtruth. """Provides the labels to a model as groundtruth.
This helper function extracts the corresponding boxes, classes, This helper function extracts the corresponding boxes, classes,
...@@ -313,6 +329,8 @@ def provide_groundtruth(model, labels): ...@@ -313,6 +329,8 @@ def provide_groundtruth(model, labels):
Args: Args:
model: The detection model to provide groundtruth to. model: The detection model to provide groundtruth to.
labels: The labels for the training or evaluation inputs. labels: The labels for the training or evaluation inputs.
training_step: int, optional. The training step for the model. Useful
for models which want to anneal loss weights.
""" """
gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes] gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
gt_classes_list = labels[fields.InputDataFields.groundtruth_classes] gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
...@@ -382,6 +400,10 @@ def provide_groundtruth(model, labels): ...@@ -382,6 +400,10 @@ def provide_groundtruth(model, labels):
if fields.InputDataFields.groundtruth_not_exhaustive_classes in labels: if fields.InputDataFields.groundtruth_not_exhaustive_classes in labels:
gt_not_exhaustive_classes = labels[ gt_not_exhaustive_classes = labels[
fields.InputDataFields.groundtruth_not_exhaustive_classes] fields.InputDataFields.groundtruth_not_exhaustive_classes]
groundtruth_image_classes = None
if fields.InputDataFields.groundtruth_image_classes in labels:
groundtruth_image_classes = labels[
fields.InputDataFields.groundtruth_image_classes]
model.provide_groundtruth( model.provide_groundtruth(
groundtruth_boxes_list=gt_boxes_list, groundtruth_boxes_list=gt_boxes_list,
groundtruth_classes_list=gt_classes_list, groundtruth_classes_list=gt_classes_list,
...@@ -402,7 +424,9 @@ def provide_groundtruth(model, labels): ...@@ -402,7 +424,9 @@ def provide_groundtruth(model, labels):
groundtruth_verified_neg_classes=gt_verified_neg_classes, groundtruth_verified_neg_classes=gt_verified_neg_classes,
groundtruth_not_exhaustive_classes=gt_not_exhaustive_classes, groundtruth_not_exhaustive_classes=gt_not_exhaustive_classes,
groundtruth_keypoint_depths_list=gt_keypoint_depths_list, groundtruth_keypoint_depths_list=gt_keypoint_depths_list,
groundtruth_keypoint_depth_weights_list=gt_keypoint_depth_weights_list) groundtruth_keypoint_depth_weights_list=gt_keypoint_depth_weights_list,
groundtruth_image_classes=groundtruth_image_classes,
training_step=training_step)
def create_model_fn(detection_model_fn, configs, hparams=None, use_tpu=False, def create_model_fn(detection_model_fn, configs, hparams=None, use_tpu=False,
......
...@@ -51,7 +51,7 @@ RESTORE_MAP_ERROR_TEMPLATE = ( ...@@ -51,7 +51,7 @@ RESTORE_MAP_ERROR_TEMPLATE = (
def _compute_losses_and_predictions_dicts( def _compute_losses_and_predictions_dicts(
model, features, labels, model, features, labels, training_step=None,
add_regularization_loss=True): add_regularization_loss=True):
"""Computes the losses dict and predictions dict for a model on inputs. """Computes the losses dict and predictions dict for a model on inputs.
...@@ -107,6 +107,7 @@ def _compute_losses_and_predictions_dicts( ...@@ -107,6 +107,7 @@ def _compute_losses_and_predictions_dicts(
float32 tensor containing keypoint depths information. float32 tensor containing keypoint depths information.
labels[fields.InputDataFields.groundtruth_keypoint_depth_weights] is a labels[fields.InputDataFields.groundtruth_keypoint_depth_weights] is a
float32 tensor containing the weights of the keypoint depth feature. float32 tensor containing the weights of the keypoint depth feature.
training_step: int, the current training step.
add_regularization_loss: Whether or not to include the model's add_regularization_loss: Whether or not to include the model's
regularization loss in the losses dictionary. regularization loss in the losses dictionary.
...@@ -116,7 +117,7 @@ def _compute_losses_and_predictions_dicts( ...@@ -116,7 +117,7 @@ def _compute_losses_and_predictions_dicts(
`model.predict`. `model.predict`.
""" """
model_lib.provide_groundtruth(model, labels) model_lib.provide_groundtruth(model, labels, training_step=training_step)
preprocessed_images = features[fields.InputDataFields.image] preprocessed_images = features[fields.InputDataFields.image]
prediction_dict = model.predict( prediction_dict = model.predict(
...@@ -166,7 +167,8 @@ def _ensure_model_is_built(model, input_dataset, unpad_groundtruth_tensors): ...@@ -166,7 +167,8 @@ def _ensure_model_is_built(model, input_dataset, unpad_groundtruth_tensors):
labels = model_lib.unstack_batch( labels = model_lib.unstack_batch(
labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
return _compute_losses_and_predictions_dicts(model, features, labels) return _compute_losses_and_predictions_dicts(model, features, labels,
training_step=0)
strategy = tf.compat.v2.distribute.get_strategy() strategy = tf.compat.v2.distribute.get_strategy()
if hasattr(tf.distribute.Strategy, 'run'): if hasattr(tf.distribute.Strategy, 'run'):
...@@ -208,6 +210,7 @@ def eager_train_step(detection_model, ...@@ -208,6 +210,7 @@ def eager_train_step(detection_model,
labels, labels,
unpad_groundtruth_tensors, unpad_groundtruth_tensors,
optimizer, optimizer,
training_step,
add_regularization_loss=True, add_regularization_loss=True,
clip_gradients_value=None, clip_gradients_value=None,
num_replicas=1.0): num_replicas=1.0):
...@@ -280,6 +283,7 @@ def eager_train_step(detection_model, ...@@ -280,6 +283,7 @@ def eager_train_step(detection_model,
float32 tensor containing the weights of the keypoint depth feature. float32 tensor containing the weights of the keypoint depth feature.
unpad_groundtruth_tensors: A parameter passed to unstack_batch. unpad_groundtruth_tensors: A parameter passed to unstack_batch.
optimizer: The training optimizer that will update the variables. optimizer: The training optimizer that will update the variables.
training_step: int, the training step number.
add_regularization_loss: Whether or not to include the model's add_regularization_loss: Whether or not to include the model's
regularization loss in the losses dictionary. regularization loss in the losses dictionary.
clip_gradients_value: If this is present, clip the gradients global norm clip_gradients_value: If this is present, clip the gradients global norm
...@@ -302,7 +306,9 @@ def eager_train_step(detection_model, ...@@ -302,7 +306,9 @@ def eager_train_step(detection_model,
with tf.GradientTape() as tape: with tf.GradientTape() as tape:
losses_dict, _ = _compute_losses_and_predictions_dicts( losses_dict, _ = _compute_losses_and_predictions_dicts(
detection_model, features, labels, add_regularization_loss) detection_model, features, labels,
training_step=training_step,
add_regularization_loss=add_regularization_loss)
losses_dict = normalize_dict(losses_dict, num_replicas) losses_dict = normalize_dict(losses_dict, num_replicas)
...@@ -632,6 +638,7 @@ def train_loop( ...@@ -632,6 +638,7 @@ def train_loop(
labels, labels,
unpad_groundtruth_tensors, unpad_groundtruth_tensors,
optimizer, optimizer,
training_step=global_step,
add_regularization_loss=add_regularization_loss, add_regularization_loss=add_regularization_loss,
clip_gradients_value=clip_gradients_value, clip_gradients_value=clip_gradients_value,
num_replicas=strategy.num_replicas_in_sync) num_replicas=strategy.num_replicas_in_sync)
...@@ -901,7 +908,8 @@ def eager_eval_loop( ...@@ -901,7 +908,8 @@ def eager_eval_loop(
labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors) labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
losses_dict, prediction_dict = _compute_losses_and_predictions_dicts( losses_dict, prediction_dict = _compute_losses_and_predictions_dicts(
detection_model, features, labels, add_regularization_loss) detection_model, features, labels, training_step=None,
add_regularization_loss=add_regularization_loss)
prediction_dict = detection_model.postprocess( prediction_dict = detection_model.postprocess(
prediction_dict, features[fields.InputDataFields.true_image_shape]) prediction_dict, features[fields.InputDataFields.true_image_shape])
eval_features = { eval_features = {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment