Unverified Commit 09d9656f authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents ac671306 49a5706c
......@@ -135,7 +135,15 @@ class SemanticSegmentationTask(base_task.Task):
use_groundtruth_dimension=loss_params.use_groundtruth_dimension,
top_k_percent_pixels=loss_params.top_k_percent_pixels)
total_loss = segmentation_loss_fn(model_outputs, labels['masks'])
total_loss = segmentation_loss_fn(model_outputs['logits'], labels['masks'])
if 'mask_scores' in model_outputs:
mask_scoring_loss_fn = segmentation_losses.MaskScoringLoss(
loss_params.ignore_label)
total_loss += mask_scoring_loss_fn(
model_outputs['mask_scores'],
model_outputs['logits'],
labels['masks'])
if aux_losses:
total_loss += tf.add_n(aux_losses)
......@@ -144,6 +152,28 @@ class SemanticSegmentationTask(base_task.Task):
return total_loss
def process_metrics(self, metrics, labels, model_outputs, **kwargs):
"""Process and update metrics.
Called when using custom training loop API.
Args:
metrics: a nested structure of metrics objects. The return of function
self.build_metrics.
labels: a tensor or a nested structure of tensors.
model_outputs: a tensor or a nested structure of tensors. For example,
output of the keras model built by self.build_model.
**kwargs: other args.
"""
for metric in metrics:
if metric.name == 'mask_scores_mse':
actual_mask_scores = segmentation_losses.get_actual_mask_scores(
model_outputs['logits'], labels['masks'],
self.task_config.losses.ignore_label)
metric.update_state(actual_mask_scores, model_outputs['mask_scores'])
else:
metric.update_state(labels, model_outputs['logits'])
def build_metrics(self, training: bool = True):
"""Gets streaming metrics for training/validation."""
metrics = []
......@@ -153,6 +183,9 @@ class SemanticSegmentationTask(base_task.Task):
num_classes=self.task_config.model.num_classes,
rescale_predictions=False,
dtype=tf.float32))
if self.task_config.model.get('mask_scoring_head'):
metrics.append(
tf.keras.metrics.MeanSquaredError(name='mask_scores_mse'))
else:
self.iou_metric = segmentation_metrics.PerClassIoU(
name='per_class_iou',
......@@ -160,6 +193,11 @@ class SemanticSegmentationTask(base_task.Task):
rescale_predictions=not self.task_config.validation_data
.resize_eval_groundtruth,
dtype=tf.float32)
if self.task_config.validation_data.resize_eval_groundtruth and self.task_config.model.get('mask_scoring_head'): # pylint: disable=line-too-long
# Mask scores metric can only be computed if labels are scaled to match
# predicted mask scores.
metrics.append(
tf.keras.metrics.MeanSquaredError(name='mask_scores_mse'))
# Update state on CPU if TPUStrategy due to dynamic resizing.
self._process_iou_metric_on_cpu = isinstance(
......@@ -194,6 +232,8 @@ class SemanticSegmentationTask(base_task.Task):
num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
with tf.GradientTape() as tape:
outputs = model(features, training=True)
if isinstance(outputs, tf.Tensor):
outputs = {'logits': outputs}
# Casting output layer as float32 is necessary when mixed_precision is
# mixed_float16 or mixed_bfloat16 to ensure output is casted as float32.
outputs = tf.nest.map_structure(
......@@ -249,6 +289,8 @@ class SemanticSegmentationTask(base_task.Task):
features, input_partition_dims)
outputs = self.inference_step(features, model)
if isinstance(outputs, tf.Tensor):
outputs = {'logits': outputs}
outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32), outputs)
if self.task_config.validation_data.resize_eval_groundtruth:
......@@ -260,9 +302,9 @@ class SemanticSegmentationTask(base_task.Task):
logs = {self.loss: loss}
if self._process_iou_metric_on_cpu:
logs.update({self.iou_metric.name: (labels, outputs)})
logs.update({self.iou_metric.name: (labels, outputs['logits'])})
else:
self.iou_metric.update_state(labels, outputs)
self.iou_metric.update_state(labels, outputs['logits'])
if metrics:
self.process_metrics(metrics, labels, outputs)
......
# Object Detection Models on TensorFlow 2
**WARNING**: This repository will be deprecated and replaced by the solid
implementations inside vision/beta/.
This repository is deprecated and replaced by the solid
implementations inside vision/beta/. All the content has been moved to
[official/legacy/detection](https://github.com/tensorflow/models/tree/master/official/legacy/detection).
## Prerequisite
To get started, download the code from TensorFlow models GitHub repository or
use the pre-installed Google Cloud VM.
```bash
git clone https://github.com/tensorflow/models.git
```
Next, make sure to use TensorFlow 2.1+ on Google Cloud. Also, here are
a few packages you need to install to get started:
```bash
sudo apt-get install -y python-tk && \
pip3 install -r ~/models/official/requirements.txt
```
## Train RetinaNet on TPU
### Train a vanilla ResNet-50 based RetinaNet.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--params_override="{ type: retinanet, train: { checkpoint: { path: ${RESNET_CHECKPOINT?}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
Note: The ResNet implementation under
[detection/](https://github.com/tensorflow/models/tree/master/official/vision/detection)
is currently different from the one under
[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
so the checkpoints are not compatible.
We will unify the implementation soon.
### Train a SpineNet-49 based RetinaNet.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--params_override="{ type: retinanet, architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
### Train a custom RetinaNet using the config file.
First, create a YAML config file, e.g. *my_retinanet.yaml*. This file specifies
the parameters to be overridden, which should at least include the following
fields.
```YAML
# my_retinanet.yaml
type: 'retinanet'
train:
train_file_pattern: <path to the TFRecord training data>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
## Train RetinaNet on GPU
Training on GPU is similar to that on TPU. The major change is the strategy
type (use "[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)" for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)" for single GPU).
Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
A single GPU example:
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--config_file="my_retinanet.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
train:
total_steps: 1
batch_size: 8
train_file_pattern: <Train TFRecord file pattern>
use_tpu: False
"
```
---
## Train Mask R-CNN on TPU
### Train a vanilla ResNet-50 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
Note: The ResNet implementation under
[detection/](https://github.com/tensorflow/models/tree/master/official/vision/detection)
is currently different from the one under
[classification/](https://github.com/tensorflow/models/tree/master/official/vision/image_classification),
so the checkpoints are not compatible.
We will unify the implementation soon.
### Train a SpineNet-49 based Mask R-CNN.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu="${TPU_NAME?}" \
--model_dir="${MODEL_DIR?}" \
--mode=train \
--model=mask_rcnn \
--params_override="{architecture: {backbone: spinenet, multilevel_features: identity}, spinenet: {model_id: 49}, train_file_pattern: ${TRAIN_FILE_PATTERN?} }, eval: { val_json_file: ${VAL_JSON_FILE?}, eval_file_pattern: ${EVAL_FILE_PATTERN?} } }"
```
### Train a custom Mask R-CNN using the config file.
First, create a YAML config file, e.g. *my_maskrcnn.yaml*.
This file specifies the parameters to be overridden,
which should at least include the following fields.
```YAML
# my_maskrcnn.yaml
train:
train_file_pattern: <path to the TFRecord training data>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
## Train Mask R-CNN on GPU
Training on GPU is similar to that on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for single GPU).
Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
A single GPU example:
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=mask_rcnn \
--config_file="my_maskrcnn.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=mask_rcnn \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
predict:
predict_batch_size: 8
architecture:
use_bfloat16: False
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Train TFRecord file pattern>
use_tpu: False
"
```
## Train ShapeMask on TPU
### Train a ResNet-50 based ShapeMask.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
RESNET_CHECKPOINT="<path to the pre-trained Resnet-50 checkpoint>"
TRAIN_FILE_PATTERN="<path to the TFRecord training data>"
EVAL_FILE_PATTERN="<path to the TFRecord validation data>"
VAL_JSON_FILE="<path to the validation annotation JSON file>"
SHAPE_PRIOR_PATH="<path to shape priors>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--params_override="{train: { checkpoint: { path: ${RESNET_CHECKPOINT}, prefix: resnet50/ }, train_file_pattern: ${TRAIN_FILE_PATTERN} }, eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN} } shapemask_head: {use_category_for_mask: true, shape_prior_path: ${SHAPE_PRIOR_PATH}} }"
```
The pre-trained ResNet-50 checkpoint can be downloaded [here](https://storage.cloud.google.com/cloud-tpu-checkpoints/model-garden-vision/detection/resnet50-2018-02-07.tar.gz).
The shape priors can be downloaded [here](https://storage.googleapis.com/cloud-tpu-checkpoints/shapemask/kmeans_class_priors_91x20x32x32.npy).
### Train a custom ShapeMask using the config file.
First, create a YAML config file, e.g. *my_shapemask.yaml*.
This file specifies the parameters to be overridden:
```YAML
# my_shapemask.yaml
train:
train_file_pattern: <path to the TFRecord training data>
total_steps: <total steps to train>
batch_size: <training batch size>
eval:
eval_file_pattern: <path to the TFRecord validation data>
val_json_file: <path to the validation annotation JSON file>
batch_size: <evaluation batch size>
shapemask_head:
shape_prior_path: <path to shape priors>
```
Once the YAML config file is created, you can launch the training using the
following command.
```bash
TPU_NAME="<your GCP TPU name>"
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
## Train ShapeMask on GPU
Training on GPU is similar to that on TPU. The major change is the strategy type
(use
"[mirrored](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy)"
for multiple GPU and
"[one_device](https://www.tensorflow.org/api_docs/python/tf/distribute/OneDeviceStrategy)"
for single GPU).
Multi-GPU example (assuming there are 8 GPUs connected to the host):
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=mirrored \
--num_gpus=8 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
A single GPU example
```bash
MODEL_DIR="<path to the directory to store model files>"
python3 ~/models/official/vision/detection/main.py \
--strategy_type=one_device \
--num_gpus=1 \
--model_dir=${MODEL_DIR} \
--mode=train \
--model=shapemask \
--config_file="my_shapemask.yaml"
```
An example with inline configuration (YAML or JSON format):
```
python3 ~/models/official/vision/detection/main.py \
--model_dir=<model folder> \
--strategy_type=one_device \
--num_gpus=1 \
--mode=train \
--model=shapemask \
--params_override="eval:
eval_file_pattern: <Eval TFRecord file pattern>
batch_size: 8
val_json_file: <COCO format groundtruth JSON file>
train:
total_steps: 1000
batch_size: 8
train_file_pattern: <Train TFRecord file pattern>
use_tpu: False
"
```
### Run the evaluation (after training)
```
python3 /usr/share/models/official/vision/detection/main.py \
--strategy_type=tpu \
--tpu=${TPU_NAME} \
--model_dir=${MODEL_DIR} \
--mode=eval \
--model=shapemask \
--params_override="{eval: { val_json_file: ${VAL_JSON_FILE}, eval_file_pattern: ${EVAL_FILE_PATTERN}, eval_samples: 5000 } }"
```
`MODEL_DIR` needs to point to the trained path of ShapeMask model.
Change `strategy_type=mirrored` and `num_gpus=1` to run on a GPU.
Note: The JSON groundtruth file is useful for the [COCO dataset](http://cocodataset.org/#home) and can be
downloaded from the [COCO website](http://cocodataset.org/#download). For a custom dataset, it is unnecessary because the groundtruth can be included in the TFRecord files.
## References
1. [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002).
Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, and Piotr Dollár. IEEE
International Conference on Computer Vision (ICCV), 2017.
......@@ -12,3 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deprecating the vision/detection folder."""
raise ImportError('This module has been moved to official/legacy/detection')
# Image Classification
**Warning:** the features in the `image_classification/` folder have been fully
integrated into vision/beta. Please use the [new code base](../beta/README.md).
This folder contains TF 2.0 model examples for image classification:
* [MNIST](#mnist)
* [Classifier Trainer](#classifier-trainer), a framework that uses the Keras
compile/fit methods for image classification models, including:
* ResNet
* EfficientNet[^1]
[^1]: Currently a work in progress. We cannot match "AutoAugment (AA)" in [the original version](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet).
For more information about other types of models, please refer to this
[README file](../../README.md).
## Before you begin
Please make sure that you have the latest version of TensorFlow
installed and
[add the models folder to your Python path](/official/#running-the-models).
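For example, assuming the repository was cloned to `~/models`, a minimal way to make the `official` package importable from a Python session is:

```python
import os
import sys

# Make `import official` resolve against the cloned repository at ~/models.
sys.path.append(os.path.expanduser('~/models'))
```

Exporting `PYTHONPATH` as described in the linked instructions achieves the same thing.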
### ImageNet preparation
#### Using TFDS
`classifier_trainer.py` supports ImageNet with
[TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview).
Please see the following [example snippet](https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/scripts/download_and_prepare.py)
for more information on how to use TFDS to download and prepare datasets, and
specifically the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md)
for manual download instructions.
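As a minimal sketch (assuming the ImageNet archives have already been manually downloaded as described in the TFDS ImageNet readme), preparing and loading the dataset with TFDS looks roughly like this:

```python
import tensorflow_datasets as tfds

# Requires the manually downloaded ImageNet archives to be available in the
# TFDS manual-download directory (see the TFDS ImageNet readme linked above).
builder = tfds.builder('imagenet2012')
builder.download_and_prepare()
train_ds = builder.as_dataset(split='train', shuffle_files=True)
```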
#### Legacy TFRecords
Download the ImageNet dataset and convert it to TFRecord format.
The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
provide a few options.
Note that the legacy ResNet runners, e.g. [resnet/resnet_ctl_imagenet_main.py](resnet/resnet_ctl_imagenet_main.py)
require TFRecords whereas `classifier_trainer.py` can use both by setting the
builder to 'records' or 'tfds' in the configurations.
### Running on Cloud TPUs
Note: These models will **not** work with TPUs on Colab.
You can train image classification models on Cloud TPUs using
[tf.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/TPUStrategy).
If you are not familiar with Cloud TPUs, it is strongly recommended that you go
through the
[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
create a TPU and GCE VM.
### Running on multiple GPU hosts
You can also train these models on multiple hosts, each with GPUs, using
[tf.distribute.experimental.MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy).
The easiest way to run multi-host benchmarks is to set the
[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
appropriately at each host. e.g., to run using `MultiWorkerMirroredStrategy` on
2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
available GPUs at each host.
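For example, `TF_CONFIG` could be set programmatically on each host before launching the trainer (the addresses below are placeholders):

```python
import json
import os

# Placeholder addresses; replace with the real host IPs and a free port.
cluster = {'worker': ['10.0.0.1:2222', '10.0.0.2:2222']}
task_index = 0  # 0 on the first host, 1 on the second, and so on.

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': cluster,
    'task': {'type': 'worker', 'index': task_index},
})
```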
## MNIST
To download the data and run the MNIST sample model locally for the first time,
run one of the following commands:
```bash
python3 mnist_main.py \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--train_epochs=10 \
--distribution_strategy=one_device \
--num_gpus=$NUM_GPUS \
--download
```
To train the model on a Cloud TPU, run the following command:
```bash
python3 mnist_main.py \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--train_epochs=10 \
--distribution_strategy=tpu \
--download
```
Note: the `--download` flag is only required the first time you run the model.
## Classifier Trainer
The classifier trainer is a unified framework for running image classification
models using Keras's compile/fit methods. Experiments should be provided in the
form of YAML files; see [configs/examples](./configs/examples) for example
configurations.
The provided configuration files use a per-replica batch size, which is scaled
by the number of devices. For instance, if `batch size` = 64, then for 1 GPU
the global batch size would be 64 * 1 = 64. For 8 GPUs, the global batch size
would be 64 * 8 = 512. Similarly, for a v3-8 TPU, the global batch size would
be 64 * 8 = 512, and for a v3-32, the global batch size is 64 * 32 = 2048.
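In other words, the effective global batch size is simply the configured per-replica batch size multiplied by the number of replicas:

```python
per_replica_batch_size = 64

for num_replicas in (1, 8, 32):  # 1 GPU, 8 GPUs or a v3-8 TPU, a v3-32 TPU
  global_batch_size = per_replica_batch_size * num_replicas
  print(num_replicas, global_batch_size)  # -> 64, 512, 2048
```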
### ResNet50
#### On GPU:
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=resnet \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/resnet/imagenet/gpu.yaml \
  --params_override="runtime.num_gpus=$NUM_GPUS"
```
To train on multiple hosts, each with GPUs attached, using
[MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy),
please update the `runtime` section in gpu.yaml
(or override it using `--params_override`) with:
```YAML
# gpu.yaml
runtime:
distribution_strategy: 'multi_worker_mirrored'
worker_hosts: '$HOST1:port,$HOST2:port'
num_gpus: $NUM_GPUS
task_index: 0
```
Set `task_index: 0` on the first host, `task_index: 1` on the second, and so
on. `$HOST1` and `$HOST2` are the IP addresses of the hosts, and `port` can be
any free port on the hosts. Only the first host will write TensorBoard
summaries and save checkpoints.
#### On TPU:
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=resnet \
--dataset=imagenet \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/resnet/imagenet/tpu.yaml
```
### EfficientNet
**Note: EfficientNet development is a work in progress.**
#### On GPU:
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=efficientnet \
--dataset=imagenet \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml \
  --params_override="runtime.num_gpus=$NUM_GPUS"
```
#### On TPU:
```bash
python3 classifier_trainer.py \
--mode=train_and_eval \
--model_type=efficientnet \
--dataset=imagenet \
--tpu=$TPU_NAME \
--model_dir=$MODEL_DIR \
--data_dir=$DATA_DIR \
--config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
```
Note that the number of GPU devices can be overridden on the command line using
`--params_override`. The TPU does not need this override as the device is fixed
by providing the TPU address or name with the `--tpu` flag.
This repository is deprecated and replaced by the solid
implementations inside vision/beta/. All the content has been moved to
[official/legacy/image_classification](https://github.com/tensorflow/models/tree/master/official/legacy/image_classification).
......@@ -12,3 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deprecating the vision/detection folder."""
raise ImportError(
'This module has been moved to official/legacy/image_classification')
......@@ -14,6 +14,7 @@
"""Utilities for creating loop functions."""
from absl import logging
from orbit.utils import tpu_summaries
import tensorflow as tf
......@@ -65,8 +66,8 @@ def create_loop_fn(step_fn):
The final state returned by `reduce_fn`, or `None` if `state` and
`reduce_fn` are not provided.
"""
step = 0
try:
step = 0
# To make sure the OutOfRangeError exception can be handled well under
# async remote eager, we need to wrap the loop body in `async_scope`.
with tf.experimental.async_scope():
......@@ -77,6 +78,7 @@ def create_loop_fn(step_fn):
step += 1
return state
except (StopIteration, tf.errors.OutOfRangeError):
logging.info("The dataset iterator is exhausted after %d steps.", step)
tf.experimental.async_clear_error()
return state
......
## Global features: CNN Image Retrieval
This Python toolbox implements the training and testing of the approach described in the papers:
[![Paper](http://img.shields.io/badge/paper-arXiv.1711.02512-B3181B.svg)](https://arxiv.org/abs/1711.02512)
```
"Fine-tuning CNN Image Retrieval with No Human Annotation",
Radenović F., Tolias G., Chum O.,
TPAMI 2018
```
[![Paper](http://img.shields.io/badge/paper-arXiv.1604.02426-B3181B.svg)](http://arxiv.org/abs/1604.02426)
```
"CNN Image Retrieval Learns from BoW: Unsupervised Fine-Tuning with Hard Examples",
Radenović F., Tolias G., Chum O.,
ECCV 2016
```
Fine-tuned CNNs are used for global feature extraction with the goal of using
those for image retrieval. The networks are trained on the <i>SfM120k</i>
landmark images dataset.
<img src="http://cmp.felk.cvut.cz/cnnimageretrieval/img/cnnimageretrieval_network_medium.png" width=\textwidth/>
When initializing the network, one of the popular pre-trained architectures
for classification tasks (such as ResNet or VGG) is used as the network’s
backbone. The
fully connected layers of such architectures are discarded, resulting in a fully
convolutional backbone. Then, given an input image of the size [W × H × C],
where C is the number of channels, W and H are image width and height,
respectively; the output is a tensor X with dimensions [W' × H' × K], where
K is the number of feature maps in the last layer. Tensor X
can be considered as a set of the input image’s deep local features. For
deep convolutional features, the simple aggregation approach based on global
pooling arguably provides the best results. This method is fast, has a small
number of parameters, and a low risk of overfitting. Keeping this in mind,
we convert local features to a global descriptor vector using one of the
retrieval system’s global poolings (MAC, SPoC, or GeM). After this stage,
the feature vector is made up of the maximum activation per feature map
with dimensionality equal to K. The final output dimensionality for the most
common networks varies from 512 to 2048, making this image representation
relatively compact.
Vectors that have been pooled are subsequently L2-normalized. The obtained
representation is then optionally passed through the fully connected
layers before being subjected to a
new L2 re-normalization. The finally produced image representation allows
comparing the resemblance of two images by simply using their inner product.
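A minimal sketch of this aggregation step (not the DELF implementation itself), pooling a batch of backbone feature maps into L2-normalized global descriptors with MAC, SPoC, or GeM:

```python
import tensorflow as tf


def global_descriptor(features, pooling='gem', p=3.0):
  """Pools [B, H', W', K] feature maps into L2-normalized [B, K] descriptors."""
  if pooling == 'mac':      # maximum activation per feature map
    desc = tf.reduce_max(features, axis=[1, 2])
  elif pooling == 'spoc':   # average (sum) pooling
    desc = tf.reduce_mean(features, axis=[1, 2])
  elif pooling == 'gem':    # generalized mean pooling with exponent p
    desc = tf.reduce_mean(tf.pow(tf.maximum(features, 1e-6), p), axis=[1, 2])
    desc = tf.pow(desc, 1.0 / p)
  else:
    raise ValueError('Unknown pooling: %s' % pooling)
  return tf.math.l2_normalize(desc, axis=1)

# The similarity of two images is then the inner product of their descriptors:
# score = tf.reduce_sum(global_descriptor(x1) * global_descriptor(x2), axis=1)
```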
### Install DELF library
To be able to use this code, please follow
[these instructions](../../../../INSTALL_INSTRUCTIONS.md) to properly install
the DELF library.
### Usage
<details>
<summary><b>Training</b></summary><br/>
Navigate (```cd```) to the folder ```DELF_ROOT/delf/python/training/global_features```.
An example training script is located at ```DELF_ROOT/delf/python/training/global_features/train.py```.
```
python3 train.py [--arch ARCH] [--batch_size N] [--data_root PATH]
[--debug] [--directory PATH] [--epochs N] [--gpu_id ID]
[--image_size SIZE] [--launch_tensorboard] [--loss LOSS]
[--loss_margin LM] [--lr LR] [--momentum M] [--multiscale SCALES]
[--neg_num N] [--optimizer OPTIMIZER] [--pool POOL] [--pool_size N]
[--pretrained] [--precompute_whitening DATASET] [--resume]
[--query_size N] [--test_datasets DATASET] [--test_freq N]
[--test_whiten] [--training_dataset DATASET] [--update_every N]
[--validation_type TYPE] [--weight_decay N] [--whitening]
```
For a detailed explanation of the options, run:
```
python3 train.py --helpfull
```
Standard training of our models was run with the following parameters:
```
python3 train.py \
--directory="DESTINATION_PATH" \
--gpu_ids='0' \
--data_root="TRAINING_DATA_DIRECTORY" \
--training_dataset='retrieval-SfM-120k' \
--test_datasets='roxford5k,rparis6k' \
--arch='ResNet101' \
--pool='gem' \
--whitening=True \
--debug=True \
--loss='triplet' \
--loss_margin=0.85 \
--optimizer='adam' \
--lr=5e-7 --neg_num=3 --query_size=2000 \
--pool_size=20000 --batch_size=5 \
--image_size=1024 --epochs=100 --test_freq=5 \
--multiscale='[1, 2**(1/2), 1/2**(1/2)]'
```
**Note**: Data and networks used for training and testing are automatically downloaded when using the example training
script (```DELF_ROOT/delf/python/training/global_features/train.py```).
</details>
<details>
<summary><b>Training logic flow</b></summary><br/>
**Initialization phase**
1. Checking if the required datasets are downloaded, and automatically downloading them (both test and train/val) if they are
not present in the data folder.
1. Setting up the logging and creating a logging/checkpoint directory.
1. Initializing the model according to the user-provided parameters (architecture/pooling/whitening/pretrained, etc.).
1. Defining loss (contrastive/triplet) according to the user parameters.
1. Defining optimizer (Adam/SGD with learning rate/weight decay/momentum) according to the user parameters.
1. Initializing CheckpointManager and resuming from the latest checkpoint if the resume flag is set.
1. Launching Tensorboard if the flag is set.
1. Initializing training (and validation, if required) datasets.
1. Freezing the BatchNorm weight updates, since training is done one image at a time, so the statistics would not be per batch; hence we freeze them (i.e., use the pretrained ImageNet statistics).
1. Evaluating the network performance before training (on the test datasets).
**Training phase**
The main training loop (for the required number of epochs):
1. Finding the hard negative pairs in the dataset (using a forward pass through the model).
1. Creating the training dataset from a generator which changes every epoch. Each
element in the dataset consists of 1 x Positive image, 1 x Query image,
N x Hard negative images (N is specified by the `neg_num` flag), and an array
specifying the Positive (-1), Query (0), and Negative (1) images.
1. Performing one training step and calculating the final epoch loss.
1. If validation is required, finding hard negatives in the validation set,
which has the same structure as the training set, then performing one validation
step and calculating the loss.
1. Evaluating on the test datasets every `test_freq` epochs.
1. Saving checkpoint (optimizer and the model weights).
</details>
## Exporting the Trained Model
Assuming the training output (the TensorFlow checkpoint) is located at the
`--directory` path, the following command exports the model:
```
python3 model/export_CNN_global_model.py \
[--ckpt_path PATH] [--export_path PATH] [--input_scales_list LIST]
[--multi_scale_pool_type TYPE] [--normalize_global_descriptor BOOL]
[--arch ARCHITECTURE] [--pool POOLING] [--whitening BOOL]
```
*NOTE:* The checkpoint path must point to the `.h5` file.
## Testing the trained model
After the trained model has been exported, it can be used to extract global
features similarly as for the DELG model. Please follow
[these instructions](https://github.com/tensorflow/models/tree/master/research/delf/delf/python/training#testing-the-trained-model).
After running the standard training setup for 100 epochs, the
following results are obtained on the ROxford and RParis datasets under
single-scale evaluation:
```
>> roxford5k: mAP E: 74.88, M: 58.28, H: 30.4
>> roxford5k: mP@k[1, 5, 10] E: [89.71 84.8 79.07],
M: [91.43 84.67 78.24],
H: [68.57 53.29 43.29]
>> rparis6k: mAP E: 89.21, M: 73.69, H: 49.1
>> rparis6k: mP@k[1, 5, 10] E: [98.57 97.43 95.57],
M: [98.57 99.14 98.14],
H: [94.29 90. 87.29]
```
\ No newline at end of file
# Lint as: python3
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Export global CNN feature tensorflow inference model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
import tensorflow as tf
from delf.python.training.model import global_model
from delf.python.training.model import export_model_utils
FLAGS = flags.FLAGS
flags.DEFINE_string('ckpt_path', None, help='Path to saved checkpoint.')
flags.DEFINE_string('export_path', None,
help='Path where model will be exported.')
flags.DEFINE_list(
'input_scales_list', None,
'Optional input image scales to use. If None (default), an input '
'end-point '
'"input_scales" is added for the exported model. If not None, the '
'specified list of floats will be hard-coded as the desired input '
'scales.')
flags.DEFINE_enum(
'multi_scale_pool_type', 'None', ['None', 'average', 'sum'],
"If 'None' (default), the model is exported with an output end-point "
"'global_descriptors', where the global descriptor for each scale is "
"returned separately. If not 'None', the global descriptor of each "
"scale is"
' pooled and a 1D global descriptor is returned, with output end-point '
"'global_descriptor'.")
flags.DEFINE_boolean('normalize_global_descriptor', False,
'If True, L2-normalizes global descriptor.')
# Network architecture and initialization options.
flags.DEFINE_string('arch', 'ResNet101',
'model architecture (default: ResNet101)')
flags.DEFINE_string('pool', 'gem', 'pooling options (default: gem)')
flags.DEFINE_boolean('whitening', False,
'train model with learnable whitening (linear layer) '
'after the pooling')
def _NormalizeImages(images, *args, **kwargs):
"""Normalize pixel values in image.
Args:
images: `Tensor`, images to normalize.
*args: Additional positional arguments, ignored.
**kwargs: Additional keyword arguments (e.g. pixel value offset/scale), ignored.
Returns:
normalized_images: `Tensor`, normalized images.
"""
# preprocess_input returns the normalized images rather than modifying its
# input in place, so the result must be returned.
return tf.keras.applications.imagenet_utils.preprocess_input(
tf.cast(images, tf.float32), mode='caffe')
class _ExtractModule(tf.Module):
"""Helper module to build and save global feature model."""
def __init__(self,
multi_scale_pool_type='None',
normalize_global_descriptor=False,
input_scales_tensor=None):
"""Initialization of global feature model.
Args:
multi_scale_pool_type: Type of multi-scale pooling to perform.
normalize_global_descriptor: Whether to L2-normalize global
descriptor.
input_scales_tensor: If None, the exported function to be used
should be ExtractFeatures, where an input end-point "input_scales" is
added for the exported model. If not None, the specified 1D tensor of
floats will be hard-coded as the desired input scales, in conjunction
with ExtractFeaturesFixedScales.
"""
self._multi_scale_pool_type = multi_scale_pool_type
self._normalize_global_descriptor = normalize_global_descriptor
if input_scales_tensor is None:
self._input_scales_tensor = []
else:
self._input_scales_tensor = input_scales_tensor
self._model = global_model.GlobalFeatureNet(
FLAGS.arch, FLAGS.pool, FLAGS.whitening, pretrained=False)
def LoadWeights(self, checkpoint_path):
self._model.load_weights(checkpoint_path)
@tf.function(input_signature=[
tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8,
name='input_image'),
tf.TensorSpec(shape=[None], dtype=tf.float32, name='input_scales'),
tf.TensorSpec(shape=[None], dtype=tf.int32,
name='input_global_scales_ind')
])
def ExtractFeatures(self, input_image, input_scales,
input_global_scales_ind):
extracted_features = export_model_utils.ExtractGlobalFeatures(
input_image,
input_scales,
input_global_scales_ind,
lambda x: self._model(x, training=False),
multi_scale_pool_type=self._multi_scale_pool_type,
normalize_global_descriptor=self._normalize_global_descriptor,
normalization_function=_NormalizeImages)
named_output_tensors = {}
named_output_tensors['global_descriptors'] = tf.identity(
extracted_features, name='global_descriptors')
return named_output_tensors
@tf.function(input_signature=[
tf.TensorSpec(shape=[None, None, 3], dtype=tf.uint8, name='input_image')
])
def ExtractFeaturesFixedScales(self, input_image):
return self.ExtractFeatures(input_image, self._input_scales_tensor,
tf.range(tf.size(self._input_scales_tensor)))
def main(argv):
if len(argv) > 1:
raise app.UsageError('Too many command-line arguments.')
export_path = FLAGS.export_path
if os.path.exists(export_path):
raise ValueError('export_path %s already exists.' % export_path)
if FLAGS.input_scales_list is None:
input_scales_tensor = None
else:
input_scales_tensor = tf.constant(
[float(s) for s in FLAGS.input_scales_list],
dtype=tf.float32,
shape=[len(FLAGS.input_scales_list)],
name='input_scales')
module = _ExtractModule(FLAGS.multi_scale_pool_type,
FLAGS.normalize_global_descriptor,
input_scales_tensor)
# Load the weights.
checkpoint_path = FLAGS.ckpt_path
module.LoadWeights(checkpoint_path)
print('Checkpoint loaded from ', checkpoint_path)
# Save the module.
if FLAGS.input_scales_list is None:
served_function = module.ExtractFeatures
else:
served_function = module.ExtractFeaturesFixedScales
tf.saved_model.save(
module, export_path, signatures={'serving_default': served_function})
if __name__ == '__main__':
app.run(main)
......@@ -183,7 +183,8 @@ def ExtractGlobalFeatures(image,
global_scales_ind,
model_fn,
multi_scale_pool_type='None',
normalize_global_descriptor=False):
normalize_global_descriptor=False,
normalization_function=gld.NormalizeImages):
"""Extract global features for input image.
Args:
......@@ -201,6 +202,7 @@ def ExtractGlobalFeatures(image,
and a 1D global descriptor is returned.
normalize_global_descriptor: If True, output global descriptors are
L2-normalized.
normalization_function: Function used for normalization.
Returns:
global_descriptors: If `multi_scale_pool_type` is 'None', returns a [S, D]
......@@ -213,7 +215,7 @@ def ExtractGlobalFeatures(image,
"""
original_image_shape_float = tf.gather(
tf.dtypes.cast(tf.shape(image), tf.float32), [0, 1])
image_tensor = gld.NormalizeImages(
image_tensor = normalization_function(
image, pixel_value_offset=128.0, pixel_value_scale=128.0)
image_tensor = tf.expand_dims(image_tensor, 0, name='image/expand_dims')
......
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
......@@ -55,20 +39,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "hCQlBGJkZTR2"
},
"outputs": [],
"source": [
"import tensorflow as tf"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aw-Ba-5RUhMs"
},
"outputs": [],
"source": [
"# Install the tensorflow Object Detection API...\n",
"# If you're running this offline, you also might need to install the protobuf-compiler:\n",
......@@ -87,9 +73,7 @@
"\n",
"# Test the installation\n",
"! python object_detection/builders/model_builder_tf2_test.py"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
......@@ -113,19 +97,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "sKYfhq7CKZ4B"
},
"outputs": [],
"source": [
"%mkdir /content/dataset\n",
"%cd /content/dataset\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz\n",
"! wget http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz\n",
"! tar zxf images.tar.gz\n",
"! tar zxf annotations.tar.gz"
],
"execution_count": null,
"outputs": []
"! tar zxf annotations.tar.gz\n",
"\n",
"XML_PATH = '/content/dataset/annotations/xmls'"
]
},
{
"cell_type": "markdown",
......@@ -133,28 +119,53 @@
"id": "44vtL0nsAqXg"
},
"source": [
"In this case, we want to reduce the PETS dataset to match the collection of cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)):\n",
"Because the following k-means script will process all XML annotations, we want to reduce the PETS dataset to include only the cats and dogs used to train the model (in [this training notebook](https://colab.sandbox.google.com/github/google-coral/tutorials/blob/master/retrain_ssdlite_mobiledet_qat_tf1.ipynb)). So we delete all annotation files that are **not** Abyssinian or American bulldog:\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8gcUoBU2K_s7"
"id": "ih48zFbl6jM7"
},
"outputs": [],
"source": [
"! cp /content/dataset/annotations/list.txt /content/dataset/annotations/list_petsdataset.txt\n",
"! cp /content/dataset/annotations/trainval.txt /content/dataset/annotations/trainval_petsdataset.txt\n",
"! cp /content/dataset/annotations/test.txt /content/dataset/annotations/test_petsdataset.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/list_petsdataset.txt > /content/dataset/annotations/list.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/list_petsdataset.txt >> /content/dataset/annotations/list.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/trainval_petsdataset.txt > /content/dataset/annotations/trainval.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/trainval_petsdataset.txt >> /content/dataset/annotations/trainval.txt\n",
"! grep \"Abyssinian\" /content/dataset/annotations/test_petsdataset.txt > /content/dataset/annotations/test.txt\n",
"! grep \"american_bulldog\" /content/dataset/annotations/test_petsdataset.txt >> /content/dataset/annotations/test.txt"
],
"! (cd /content/dataset/annotations/xmls/ \u0026\u0026 \\\n",
" find . ! \\( -name 'Abyssinian*' -o -name 'american_bulldog*' \\) -type f -exec rm -f {} \\; )"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KG8uraCK-RSM"
},
"source": [
"### Upload your own dataset"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "m0bh_iKD-Xz4"
},
"source": [
"To generate the anchor box ratios for your own dataset, upload a ZIP file with your annotation files (click the **Files** tab on the left, and drag-drop your ZIP file there), and then uncomment the following code to unzip it and specify the path to the directory with your annotation files:"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": []
"metadata": {
"id": "M0j_vWDR3WkK"
},
"outputs": [],
"source": [
"# %cd /content/\n",
"# !unzip dataset.zip\n",
"\n",
"# XML_PATH = '/content/dataset/annotations/xmls'"
]
},
{
"cell_type": "markdown",
......@@ -188,23 +199,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "vCB8Dfs0Xlyv"
},
"outputs": [],
"source": [
"import sys\n",
"import glob\n",
"import os\n",
"import numpy as np\n",
"import xml.etree.ElementTree as ET\n",
"\n",
"from sklearn.cluster import KMeans\n",
"\n",
"def xml_to_boxes(path, classes, rescale_width=None, rescale_height=None):\n",
"def xml_to_boxes(path, rescale_width=None, rescale_height=None):\n",
" \"\"\"Extracts bounding-box widths and heights from ground-truth dataset.\n",
"\n",
" Args:\n",
" path : Path to .xml annotation files for your dataset.\n",
" classes : List of classes that are part of dataset.\n",
" rescale_width : Scaling factor to rescale width of bounding box.\n",
" rescale_height : Scaling factor to rescale height of bounding box.\n",
"\n",
......@@ -213,23 +225,20 @@
" \"\"\"\n",
"\n",
" xml_list = []\n",
" for clss in classes:\n",
" for xml_file in glob.glob(path + '/'+clss+'*'):\n",
" if xml_file.endswith('.xml'):\n",
" tree = ET.parse(xml_file)\n",
" root = tree.getroot()\n",
" for member in root.findall('object'):\n",
" bndbox = member.find('bndbox')\n",
" bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n",
" bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n",
" if rescale_width and rescale_height:\n",
" size = root.find('size')\n",
" bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n",
" bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n",
"\n",
" xml_list.append([bbox_width, bbox_height])\n",
" else:\n",
" continue\n",
" filenames = os.listdir(os.path.join(path))\n",
" filenames = [os.path.join(path, f) for f in filenames if (f.endswith('.xml'))]\n",
" for xml_file in filenames:\n",
" tree = ET.parse(xml_file)\n",
" root = tree.getroot()\n",
" for member in root.findall('object'):\n",
" bndbox = member.find('bndbox')\n",
" bbox_width = int(bndbox.find('xmax').text) - int(bndbox.find('xmin').text)\n",
" bbox_height = int(bndbox.find('ymax').text) - int(bndbox.find('ymin').text)\n",
" if rescale_width and rescale_height:\n",
" size = root.find('size')\n",
" bbox_width = bbox_width * (rescale_width / int(size.find('width').text))\n",
" bbox_height = bbox_height * (rescale_height / int(size.find('height').text))\n",
" xml_list.append([bbox_width, bbox_height])\n",
" bboxes = np.array(xml_list)\n",
" return bboxes\n",
"\n",
......@@ -275,10 +284,10 @@
" assert len(bboxes), \"You must provide bounding boxes\"\n",
"\n",
" normalized_bboxes = bboxes / np.sqrt(bboxes.prod(axis=1, keepdims=True))\n",
"\n",
" # Using kmeans to find centroids of the width/height clusters\n",
" \n",
" # Using kmeans to find centroids of the width/height clusters\n",
" kmeans = KMeans(\n",
" init='random', n_clusters=num_aspect_ratios,random_state=0, max_iter=kmeans_max_iter)\n",
" init='random', n_clusters=num_aspect_ratios, random_state=0, max_iter=kmeans_max_iter)\n",
" kmeans.fit(X=normalized_bboxes)\n",
" ar = kmeans.cluster_centers_\n",
"\n",
......@@ -292,9 +301,7 @@
" aspect_ratios = [w/h for w,h in ar]\n",
"\n",
" return aspect_ratios, avg_iou_perc"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
......@@ -323,13 +330,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cNw-vX3nfl1g"
},
"outputs": [],
"source": [
"classes = ['Abyssinian','american_bulldog']\n",
"xml_path = '/content/dataset/annotations/xmls'\n",
"\n",
"# Tune this based on your accuracy/speed goals as described above\n",
"num_aspect_ratios = 4 # can be [2,3,4,5,6]\n",
"\n",
......@@ -342,8 +348,7 @@
"height = 320\n",
"\n",
"# Get the ground-truth bounding boxes for our dataset\n",
"bboxes = xml_to_boxes(path=xml_path, classes=classes,\n",
" rescale_width=width, rescale_height=height)\n",
"bboxes = xml_to_boxes(path=XML_PATH, rescale_width=width, rescale_height=height)\n",
"\n",
"aspect_ratios, avg_iou_perc = kmeans_aspect_ratios(\n",
" bboxes=bboxes,\n",
......@@ -354,9 +359,7 @@
"\n",
"print('Aspect ratios generated:', [round(ar,2) for ar in aspect_ratios])\n",
"print('Average IOU with anchors:', avg_iou_perc)"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
......@@ -378,9 +381,11 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "AlMffd3rgKW2"
},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"from google.protobuf import text_format\n",
......@@ -404,9 +409,7 @@
" f.write(config_text)\n",
"# Check for updated aspect ratios in the config\n",
"!cat /content/ssdlite_mobiledet_edgetpu_320x320_custom_aspect_ratios.config"
],
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
......@@ -441,5 +444,22 @@
"\n"
]
}
]
}
\ No newline at end of file
],
"metadata": {
"colab": {
"collapsed_sections": [],
"name": "Generate_SSD_anchor_box_aspect_ratios_using_k_means_clustering.ipynb",
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
......@@ -89,6 +89,7 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
"""
self._num_classes = num_classes
self._groundtruth_lists = {}
self._training_step = None
super(DetectionModel, self).__init__()
......@@ -132,6 +133,13 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
"""
return field in self._groundtruth_lists
@property
def training_step(self):
if self._training_step is None:
raise ValueError('Training step was not provided to the model.')
return self._training_step
@staticmethod
def get_side_inputs(features):
"""Get side inputs from input features.
......@@ -318,7 +326,9 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
groundtruth_verified_neg_classes=None,
groundtruth_not_exhaustive_classes=None,
groundtruth_keypoint_depths_list=None,
groundtruth_keypoint_depth_weights_list=None):
groundtruth_keypoint_depth_weights_list=None,
groundtruth_image_classes=None,
training_step=None):
"""Provide groundtruth tensors.
Args:
......@@ -389,6 +399,11 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
groundtruth_keypoint_depth_weights_list: a list of 2-D tf.float32 tensors
of shape [num_boxes, num_keypoints] containing the weights of the
relative depths.
groundtruth_image_classes: A list of 1-D tf.float32 tensors of shape
[num_classes], containing label indices encoded as k-hot of the classes
that are present or not present in the image.
training_step: An integer denoting the current training step. This is
useful when models want to anneal loss terms.
"""
self._groundtruth_lists[fields.BoxListFields.boxes] = groundtruth_boxes_list
self._groundtruth_lists[
......@@ -463,11 +478,17 @@ class DetectionModel(six.with_metaclass(abc.ABCMeta, _BaseClass)):
self._groundtruth_lists[
fields.InputDataFields
.groundtruth_verified_neg_classes] = groundtruth_verified_neg_classes
if groundtruth_image_classes:
self._groundtruth_lists[
fields.InputDataFields
.groundtruth_image_classes] = groundtruth_image_classes
if groundtruth_not_exhaustive_classes:
self._groundtruth_lists[
fields.InputDataFields
.groundtruth_not_exhaustive_classes] = (
groundtruth_not_exhaustive_classes)
if training_step is not None:
self._training_step = training_step
@abc.abstractmethod
def regularization_losses(self):
......
......@@ -925,7 +925,9 @@ class CenterNetCenterHeatmapTargetAssigner(object):
compute_heatmap_sparse=False,
keypoint_class_id=None,
keypoint_indices=None,
keypoint_weights_for_center=None):
keypoint_weights_for_center=None,
box_heatmap_type='adaptive_gaussian',
heatmap_exponent=1.0):
"""Initializes the target assigner.
Args:
......@@ -947,6 +949,17 @@ class CenterNetCenterHeatmapTargetAssigner(object):
the number of keypoints. The object center is calculated by the weighted
mean of the keypoint locations. If not provided, the object center is
determined by the center of the bounding box (default behavior).
box_heatmap_type: str, the algorithm used to compute the box heatmap,
used when calling the assign_center_targets_from_boxes method.
Options are:
'adaptive_gaussian': A box-size adaptive Gaussian from the original
paper[1].
'iou': IOU based heatmap target where each point is assigned an IOU
based on its location, assuming that it produced a box centered at
that point with the correct size.
heatmap_exponent: float, the generated heatmap is exponentiated with
this number. A number > 1 will result in the heatmap being more peaked
and a number < 1 will cause the heatmap to be more spread out.
"""
self._stride = stride
......@@ -955,6 +968,8 @@ class CenterNetCenterHeatmapTargetAssigner(object):
self._keypoint_class_id = keypoint_class_id
self._keypoint_indices = keypoint_indices
self._keypoint_weights_for_center = keypoint_weights_for_center
self._box_heatmap_type = box_heatmap_type
self._heatmap_exponent = heatmap_exponent
def assign_center_targets_from_boxes(self,
height,
......@@ -1018,19 +1033,29 @@ class CenterNetCenterHeatmapTargetAssigner(object):
self._min_overlap)
# Apply the Gaussian kernel to the center coordinates. Returned heatmap
# has shape of [out_height, out_width, num_classes]
heatmap = ta_utils.coordinates_to_heatmap(
y_grid=y_grid,
x_grid=x_grid,
y_coordinates=y_center,
x_coordinates=x_center,
sigma=sigma,
channel_onehot=class_targets,
channel_weights=weights,
sparse=self._compute_heatmap_sparse)
if self._box_heatmap_type == 'adaptive_gaussian':
heatmap = ta_utils.coordinates_to_heatmap(
y_grid=y_grid,
x_grid=x_grid,
y_coordinates=y_center,
x_coordinates=x_center,
sigma=sigma,
channel_onehot=class_targets,
channel_weights=weights,
sparse=self._compute_heatmap_sparse)
elif self._box_heatmap_type == 'iou':
heatmap = ta_utils.coordinates_to_iou(y_grid, x_grid, boxes,
class_targets, weights)
else:
raise ValueError(f'Unknown heatmap type - {self._box_heatmap_type}')
heatmaps.append(heatmap)
# Return the stacked heatmaps over the batch.
return tf.stack(heatmaps, axis=0)
stacked_heatmaps = tf.stack(heatmaps, axis=0)
return (tf.pow(stacked_heatmaps, self._heatmap_exponent) if
self._heatmap_exponent != 1.0 else stacked_heatmaps)
def assign_center_targets_from_keypoints(self,
height,
......
......@@ -1678,6 +1678,66 @@ class CenterNetBoxTargetAssignerTest(test_case.TestCase):
np.testing.assert_array_equal(preds, [[1, 2], [3, 4], [5, 6], [7, 8]])
class CenterNetIOUTargetAssignerTest(test_case.TestCase):
def setUp(self):
super(CenterNetIOUTargetAssignerTest, self).setUp()
self._box_center = [0.0, 0.0, 1.0, 1.0]
self._box_center_small = [0.25, 0.25, 0.75, 0.75]
self._box_lower_left = [0.5, 0.0, 1.0, 0.5]
self._box_center_offset = [0.1, 0.05, 1.0, 1.0]
self._box_odd_coordinates = [0.1625, 0.2125, 0.5625, 0.9625]
def test_center_location(self):
"""Test that the centers are at the correct location."""
def graph_fn():
box_batch = [tf.constant([self._box_center, self._box_lower_left]),
tf.constant([self._box_lower_left, self._box_center])]
classes = [
tf.one_hot([0, 1], depth=4),
tf.one_hot([2, 2], depth=4)
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
4, box_heatmap_type='iou')
targets = assigner.assign_center_targets_from_boxes(
80, 80, box_batch, classes)
return targets
targets = self.execute(graph_fn, [])
self.assertEqual((10, 10), _array_argmax(targets[0, :, :, 0]))
self.assertAlmostEqual(1.0, targets[0, 10, 10, 0])
self.assertEqual((15, 5), _array_argmax(targets[0, :, :, 1]))
self.assertAlmostEqual(1.0, targets[0, 15, 5, 1])
self.assertAlmostEqual(1.0, targets[1, 15, 5, 2])
self.assertAlmostEqual(1.0, targets[1, 10, 10, 2])
self.assertAlmostEqual(0.0, targets[1, 0, 19, 1])
def test_exponent(self):
"""Test that the centers are at the correct location."""
def graph_fn():
box_batch = [tf.constant([self._box_center, self._box_lower_left]),
tf.constant([self._box_lower_left, self._box_center])]
classes = [
tf.one_hot([0], depth=2),
]
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
1, box_heatmap_type='iou')
targets = assigner.assign_center_targets_from_boxes(
4, 4, box_batch, classes)
assigner = targetassigner.CenterNetCenterHeatmapTargetAssigner(
1, box_heatmap_type='iou', heatmap_exponent=0.5)
targets_pow = assigner.assign_center_targets_from_boxes(
4, 4, box_batch, classes)
return targets, targets_pow
targets, targets_pow = self.execute(graph_fn, [])
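    # With heatmap_exponent=0.5, the assigner returns the square root of the
    # default IOU heatmap, so squaring it should recover the original values.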
self.assertLess(targets[0, 2, 3, 0], 1.0)
self.assertLess(targets_pow[0, 2, 3, 0], 1.0)
self.assertAlmostEqual(targets[0, 2, 3, 0], targets_pow[0, 2, 3, 0] ** 2)
class CenterNetKeypointTargetAssignerTest(test_case.TestCase):
def test_keypoint_heatmap_targets(self):
......
......@@ -10,12 +10,12 @@ devices. It enables on-device machine learning inference with low latency and a
small binary size. TensorFlow Lite uses many techniques for this such as
quantized kernels that allow smaller and faster (fixed-point math) models.
For this section, you will need to build [TensorFlow from
source](https://www.tensorflow.org/install/install_sources) to get the
TensorFlow Lite support for the SSD model. At this time only SSD models are supported.
Models like faster_rcnn are not supported at this time. You will also need to install the
[bazel build
tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel).
For this section, you will need to build
[TensorFlow from source](https://www.tensorflow.org/install/install_sources) to
get the TensorFlow Lite support for the SSD model. At this time only SSD models
are supported; models like faster_rcnn are not yet supported. You will also
need to install the
[bazel build tool](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel).
To make these commands easier to run, let’s set up some environment variables:
......@@ -96,7 +96,17 @@ bazel run -c opt tensorflow/lite/python:tflite_convert -- \
--allow_custom_ops
```
# Running our model on Android
## Adding Metadata to the model
To make it easier to use tflite models on mobile, you will need to add
[metadata](https://www.tensorflow.org/lite/convert/metadata) to your model and
also
[pack](https://www.tensorflow.org/lite/convert/metadata#pack_metadata_and_associated_files_into_the_model)
the associated labels file into it.
If you need more information, this process is also explained in the
[Metadata writer Object detectors documentation](https://www.tensorflow.org/lite/convert/metadata_writer_tutorial#object_detectors).
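For reference, a minimal sketch of this step using the TFLite Support metadata
writers is shown below. It assumes the `detect.tflite` and `labelmap.txt` files
produced earlier in this guide; the output path is only an example, and the
normalization values may differ for your model.
```python
from tflite_support.metadata_writers import object_detector
from tflite_support.metadata_writers import writer_utils

# Paths assumed from the earlier steps of this guide.
_TFLITE_MODEL_PATH = "/tmp/tflite/detect.tflite"
_TFLITE_LABEL_PATH = "/tmp/tflite/labelmap.txt"
_TFLITE_MODEL_WITH_METADATA_PATH = "/tmp/tflite/detect_with_metadata.tflite"

# Create an object detector metadata writer, pack the labels file into the
# model, and save the populated model.
writer = object_detector.MetadataWriter.create_for_inference(
    writer_utils.load_file(_TFLITE_MODEL_PATH), input_norm_mean=[0],
    input_norm_std=[255], label_file_paths=[_TFLITE_LABEL_PATH])
writer_utils.save_file(writer.populate(), _TFLITE_MODEL_WITH_METADATA_PATH)
```
Whichever file name you keep, make sure the model you copy into the assets
directory in the next section is the metadata-populated one.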
## Running our model on Android
To run our TensorFlow Lite model on device, we will use Android Studio to build
and run the TensorFlow Lite detection example with the new model. The example is
......@@ -119,8 +129,8 @@ cp /tmp/tflite/detect.tflite \
$TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
```
You will also need to copy your new labelmap labelmap.txt to the assets
directory.
It's important to note that the labels file should be packed into the model (as
mentioned previously).
We will now edit the gradle build file to use these assets. First, open the
`build.gradle` file
......@@ -128,17 +138,15 @@ We will now edit the gradle build file to use these assets. First, open the
out the model download script to avoid your assets being overwritten: `// apply
from:'download_model.gradle'` ```
If your model is named `detect.tflite`, and your labels file `labelmap.txt`, the
example will use them automatically as long as they've been properly copied into
the base assets directory. If you need to use a custom path or filename, open up
the
If your model is named `detect.tflite`, the example will use it automatically as
long as it has been properly copied into the base assets directory. If you need
to use a custom path or filename, open up the
$TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
file in a text editor and find the definition of TF_OD_API_LABELS_FILE. Update
this path to point to your new label map file:
"labels_list.txt". Note that if your model is quantized,
the flag TF_OD_API_IS_QUANTIZED is set to true, and if your model is floating
point, the flag TF_OD_API_IS_QUANTIZED is set to false. This new section of
DetectorActivity.java should now look as follows for a quantized model:
file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Note that
the flag TF_OD_API_IS_QUANTIZED is set to true if your model is quantized and to
false if it is floating point. This new section of DetectorActivity.java should
now look as follows for a quantized model:
```java
private static final boolean TF_OD_API_IS_QUANTIZED = true;
......
......@@ -92,27 +92,15 @@ converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8,
converter.representative_dataset = <...>
```
### Step 3: Add Metadata
### Step 3: Add Metadata to the model
The model needs to be packed with
[TFLite Metadata](https://www.tensorflow.org/lite/convert/metadata) to enable
easy integration into mobile apps using the
[TFLite Task Library](https://www.tensorflow.org/lite/inference_with_metadata/task_library/object_detector).
This metadata helps the inference code perform the correct pre & post processing
as required by the model. Use the following code to create the metadata.
```python
from tflite_support.metadata_writers import object_detector
from tflite_support.metadata_writers import writer_utils
writer = object_detector.MetadataWriter.create_for_inference(
writer_utils.load_file(_TFLITE_MODEL_PATH), input_norm_mean=[0],
input_norm_std=[255], label_file_paths=[_TFLITE_LABEL_PATH])
writer_utils.save_file(writer.populate(), _TFLITE_MODEL_WITH_METADATA_PATH)
```
See the TFLite Metadata Writer API [documentation](https://www.tensorflow.org/lite/convert/metadata_writer_tutorial#object_detectors)
for more details.
To make it easier to use tflite models on mobile, you will need to add
[metadata](https://www.tensorflow.org/lite/convert/metadata) to your model and
also
[pack](https://www.tensorflow.org/lite/convert/metadata#pack_metadata_and_associated_files_into_the_model)
the associated labels file into it.
If you need more information, this process is also explained in the
[Image classification sample](https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/metadata).
## Running our model on Android
......@@ -142,9 +130,9 @@ the
that support API >= 21. Additional details are available on the
[TensorFlow Lite example page](https://github.com/tensorflow/examples/tree/master/lite/examples/object_detection/android).
Next we need to point the app to our new detect.tflite file and give it the
names of our new labels. Specifically, we will copy our TensorFlow Lite
model with metadata to the app assets directory with the following command:
Next we need to point the app to our new detect.tflite file. Specifically, we
will copy our TensorFlow Lite flatbuffer to the app assets directory with the
following command:
```shell
mkdir $TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
......@@ -152,21 +140,30 @@ cp /tmp/tflite/detect.tflite \
$TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/assets
```
It's important to note that the labels file should be packed into the model (as
mentioned in Step 3).
We will now edit the gradle build file to use these assets. First, open the
`build.gradle` file
`$TF_EXAMPLES/lite/examples/object_detection/android/app/build.gradle`. Comment
out the model download script to avoid your assets being overwritten:
```shell
// apply from:'download_model.gradle'
```
out the model download script to avoid your assets being overwritten: `// apply
from:'download_model.gradle'` ```
If your model is named `detect.tflite`, the example will use it automatically as
long as it has been properly copied into the base assets directory. If you need
to use a custom path or filename, open up the
$TF_EXAMPLES/lite/examples/object_detection/android/app/src/main/java/org/tensorflow/demo/DetectorActivity.java
file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Update
this path to point to your new model file.
file in a text editor and find the definition of TF_OD_API_MODEL_FILE. Note that
the flag TF_OD_API_IS_QUANTIZED is set to true if your model is quantized and to
false if it is floating point. This new section of DetectorActivity.java should
now look as follows for a quantized model:
```java
private static final boolean TF_OD_API_IS_QUANTIZED = true;
private static final String TF_OD_API_MODEL_FILE = "detect.tflite";
private static final String TF_OD_API_LABELS_FILE = "labels_list.txt";
```
Once you’ve copied the TensorFlow Lite model and edited the gradle build script
to not use the downloaded assets, you can build and deploy the app using the
......
......@@ -668,7 +668,8 @@ def _get_labels_dict(input_dict):
fields.InputDataFields.groundtruth_dp_surface_coords,
fields.InputDataFields.groundtruth_track_ids,
fields.InputDataFields.groundtruth_verified_neg_classes,
fields.InputDataFields.groundtruth_not_exhaustive_classes
fields.InputDataFields.groundtruth_not_exhaustive_classes,
fields.InputDataFields.groundtruth_image_classes,
]
for key in optional_label_keys:
......
......@@ -12,12 +12,12 @@ import tensorflow as tf
from object_detection.builders import losses_builder
from object_detection.core import box_list
from object_detection.core import box_list_ops
from object_detection.core import losses
from object_detection.core import preprocessor
from object_detection.core import standard_fields as fields
from object_detection.meta_architectures import center_net_meta_arch
from object_detection.models.keras_models import hourglass_network
from object_detection.models.keras_models import resnet_v1
from object_detection.protos import center_net_pb2
from object_detection.protos import losses_pb2
from object_detection.protos import preprocessor_pb2
from object_detection.utils import shape_utils
......@@ -38,46 +38,26 @@ NEIGHBORS_2D = [[-1, -1], [-1, 0], [-1, 1],
[0, -1], [0, 1],
[1, -1], [1, 0], [1, 1]]
WEAK_LOSSES = [DEEP_MASK_BOX_CONSISTENCY, DEEP_MASK_COLOR_CONSISTENCY]
MASK_LOSSES = WEAK_LOSSES + [DEEP_MASK_ESTIMATION]
class DeepMACParams(
collections.namedtuple('DeepMACParams', [
DeepMACParams = collections.namedtuple('DeepMACParams', [
'classification_loss', 'dim', 'task_loss_weight', 'pixel_embedding_dim',
'allowed_masked_classes_ids', 'mask_size', 'mask_num_subsamples',
'use_xy', 'network_type', 'use_instance_embedding', 'num_init_channels',
'predict_full_resolution_masks', 'postprocess_crop_size',
'max_roi_jitter_ratio', 'roi_jitter_mode',
'box_consistency_loss_weight', 'color_consistency_threshold',
'color_consistency_dilation', 'color_consistency_loss_weight'
])):
"""Class holding the DeepMAC network configutration."""
__slots__ = ()
def __new__(cls, classification_loss, dim, task_loss_weight,
pixel_embedding_dim, allowed_masked_classes_ids, mask_size,
mask_num_subsamples, use_xy, network_type, use_instance_embedding,
num_init_channels, predict_full_resolution_masks,
postprocess_crop_size, max_roi_jitter_ratio,
roi_jitter_mode, box_consistency_loss_weight,
color_consistency_threshold, color_consistency_dilation,
color_consistency_loss_weight):
return super(DeepMACParams,
cls).__new__(cls, classification_loss, dim,
task_loss_weight, pixel_embedding_dim,
allowed_masked_classes_ids, mask_size,
mask_num_subsamples, use_xy, network_type,
use_instance_embedding, num_init_channels,
predict_full_resolution_masks,
postprocess_crop_size, max_roi_jitter_ratio,
roi_jitter_mode, box_consistency_loss_weight,
color_consistency_threshold,
color_consistency_dilation,
color_consistency_loss_weight)
def _get_weak_loss_weight(loss_name, config):
if loss_name == DEEP_MASK_COLOR_CONSISTENCY:
'color_consistency_dilation', 'color_consistency_loss_weight',
'box_consistency_loss_normalize', 'box_consistency_tightness',
'color_consistency_warmup_steps', 'color_consistency_warmup_start'
])
def _get_loss_weight(loss_name, config):
if loss_name == DEEP_MASK_ESTIMATION:
return config.task_loss_weight
elif loss_name == DEEP_MASK_COLOR_CONSISTENCY:
return config.color_consistency_loss_weight
elif loss_name == DEEP_MASK_BOX_CONSISTENCY:
return config.box_consistency_loss_weight
......@@ -151,7 +131,7 @@ def _get_deepmac_network_by_type(name, num_init_channels, mask_size=None):
raise ValueError('Mask size must be set.')
return FullyConnectedMaskHead(num_init_channels, mask_size)
elif name == 'embedding_projection':
elif _is_mask_head_param_free(name):
return tf.keras.layers.Lambda(lambda x: x)
elif name.startswith('resnet'):
......@@ -395,6 +375,94 @@ def dilated_cross_same_mask_label(instance_masks, dilation=2):
return tf.transpose(same_mask_prob, (0, 3, 1, 2))
def _per_pixel_single_conv(input_tensor, params, channels):
"""Convolve the given input with the given params.
Args:
input_tensor: A [num_instances, height, width, channels] shaped
float tensor.
params: A [num_instances, num_params] shaped float tensor.
channels: int, number of channels in the convolution.
Returns:
output: A float tensor of shape [num_instances, height, width, channels]
"""
input_channels = input_tensor.get_shape().as_list()[3]
weights = params[:, :(input_channels * channels)]
biases = params[:, (input_channels * channels):]
num_instances = tf.shape(params)[0]
weights = tf.reshape(weights, (num_instances, input_channels, channels))
output = (input_tensor[:, :, tf.newaxis, :] @
weights[:, tf.newaxis, tf.newaxis, :, :])
output = output[:, :, 0, :, :]
output = output + biases[:, tf.newaxis, tf.newaxis, :]
return output
def per_pixel_conditional_conv(input_tensor, parameters, channels, depth):
"""Use parameters perform per-pixel convolutions with the given depth [1].
[1]: https://arxiv.org/abs/2003.05664
Args:
input_tensor: float tensor of shape [num_instances, height,
width, input_channels]
parameters: A [num_instances, num_params] float tensor. If num_params
      is incompatible with the given channels and depth, a ValueError will
be raised.
channels: int, the number of channels in the convolution.
depth: int, the number of layers of convolutions to perform.
Returns:
output: A [num_instances, height, width] tensor with the conditional
conv applied according to each instance's parameters.
"""
input_channels = input_tensor.get_shape().as_list()[3]
num_params = parameters.get_shape().as_list()[1]
input_convs = 1 if depth > 1 else 0
intermediate_convs = depth - 2 if depth >= 2 else 0
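  # Each instance's parameter vector packs, in order, the weights and biases of
  # `depth` 1x1 convolutions: input_channels -> channels (only when depth > 1),
  # channels -> channels for each intermediate layer, and channels -> 1 for the
  # final layer.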
expected_weights = ((input_channels * channels * input_convs) +
(channels * channels * intermediate_convs) +
channels) # final conv
expected_biases = (channels * (depth - 1)) + 1
if depth == 1:
if input_channels != channels:
raise ValueError(
'When depth=1, input_channels({}) should be equal to'.format(
input_channels) + ' channels({})'.format(channels))
if num_params != (expected_weights + expected_biases):
raise ValueError('Expected {} parameters at depth {}, but got {}'.format(
expected_weights + expected_biases, depth, num_params))
start = 0
output = input_tensor
for i in range(depth):
is_last_layer = i == (depth - 1)
if is_last_layer:
channels = 1
num_params_single_conv = channels * input_channels + channels
params = parameters[:, start:start + num_params_single_conv]
start += num_params_single_conv
output = _per_pixel_single_conv(output, params, channels)
if not is_last_layer:
output = tf.nn.relu(output)
input_channels = channels
return output
class ResNetMaskNetwork(tf.keras.layers.Layer):
"""A small wrapper around ResNet blocks to predict masks."""
......@@ -560,6 +628,16 @@ class DenseResNet(tf.keras.layers.Layer):
return self.out_conv(self.resnet(net))
def _is_mask_head_param_free(name):
# Mask heads which don't have parameters of their own and instead rely
# on the instance embedding.
if name == 'embedding_projection' or name.startswith('cond_inst'):
return True
return False
class MaskHeadNetwork(tf.keras.layers.Layer):
"""Mask head class for DeepMAC."""
......@@ -586,13 +664,14 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
self._use_instance_embedding = use_instance_embedding
self._network_type = network_type
self._num_init_channels = num_init_channels
if (self._use_instance_embedding and
(self._network_type == 'embedding_projection')):
(_is_mask_head_param_free(network_type))):
raise ValueError(('Cannot feed instance embedding to mask head when '
'computing embedding projection.'))
'mask-head has no parameters.'))
if network_type == 'embedding_projection':
if _is_mask_head_param_free(network_type):
self.project_out = tf.keras.layers.Lambda(lambda x: x)
else:
self.project_out = tf.keras.layers.Conv2D(
......@@ -632,6 +711,11 @@ class MaskHeadNetwork(tf.keras.layers.Layer):
instance_embedding = instance_embedding[:, tf.newaxis, tf.newaxis, :]
out = embedding_projection(instance_embedding, out)
elif self._network_type.startswith('cond_inst'):
depth = int(self._network_type.lstrip('cond_inst'))
out = per_pixel_conditional_conv(out, instance_embedding,
self._num_init_channels, depth)
if out.shape[-1] > 1:
out = self.project_out(out)
......@@ -651,6 +735,9 @@ def deepmac_proto_to_params(deepmac_config):
jitter_mode = preprocessor_pb2.RandomJitterBoxes.JitterMode.Name(
deepmac_config.jitter_mode).lower()
box_consistency_loss_normalize = center_net_pb2.LossNormalize.Name(
deepmac_config.box_consistency_loss_normalize).lower()
return DeepMACParams(
dim=deepmac_config.dim,
classification_loss=classification_loss,
......@@ -671,7 +758,14 @@ def deepmac_proto_to_params(deepmac_config):
box_consistency_loss_weight=deepmac_config.box_consistency_loss_weight,
color_consistency_threshold=deepmac_config.color_consistency_threshold,
color_consistency_dilation=deepmac_config.color_consistency_dilation,
color_consistency_loss_weight=deepmac_config.color_consistency_loss_weight
color_consistency_loss_weight=
deepmac_config.color_consistency_loss_weight,
box_consistency_loss_normalize=box_consistency_loss_normalize,
box_consistency_tightness=deepmac_config.box_consistency_tightness,
color_consistency_warmup_steps=
deepmac_config.color_consistency_warmup_steps,
color_consistency_warmup_start=
deepmac_config.color_consistency_warmup_start
)
......@@ -868,6 +962,60 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
return resize_instance_masks(logits, (height, width))
def _aggregate_classification_loss(self, loss, gt, pred, method):
"""Aggregates loss at a per-instance level.
When this function is used with mask-heads, num_classes is usually 1.
Args:
      loss: A [num_instances, num_pixels, num_classes] or
        [num_instances, num_classes] tensor. If the tensor is of rank 2, i.e.,
        of the form [num_instances, num_classes], we assume that the loss has
        already been normalized over pixels.
      gt: A [num_instances, num_pixels, num_classes] float tensor of
        groundtruths.
      pred: A [num_instances, num_pixels, num_classes] float tensor of
        predictions.
      method: A string in ['normalize_auto', 'normalize_groundtruth_count',
        'normalize_balanced'].
        'normalize_auto': When `loss` is rank 2, aggregates by sum. Otherwise,
          aggregates by mean.
        'normalize_groundtruth_count': Aggregates the loss by computing the
          sum and dividing by the number of positive (1) groundtruth pixels.
        'normalize_balanced': Normalizes each pixel by the number of positive
          or negative pixels depending on the groundtruth.
Returns:
per_instance_loss: A [num_instances] float tensor.
"""
rank = len(loss.get_shape().as_list())
if rank == 2:
axes = [1]
else:
axes = [1, 2]
if method == 'normalize_auto':
normalization = 1.0
if rank == 2:
return tf.reduce_sum(loss, axis=axes)
else:
return tf.reduce_mean(loss, axis=axes)
elif method == 'normalize_groundtruth_count':
normalization = tf.reduce_sum(gt, axis=axes)
return tf.reduce_sum(loss, axis=axes) / normalization
elif method == 'normalize_balanced':
if rank != 3:
        raise ValueError('Cannot apply normalize_balanced aggregation '
f'to loss of rank {rank}')
normalization = (
(gt * tf.reduce_sum(gt, keepdims=True, axis=axes)) +
(1 - gt) * tf.reduce_sum(1 - gt, keepdims=True, axis=axes))
return tf.reduce_sum(loss / normalization, axis=axes)
else:
raise ValueError('Unknown loss aggregation - {}'.format(method))
def _compute_per_instance_mask_prediction_loss(
self, boxes, mask_logits, mask_gt):
"""Compute the per-instance mask loss.
......@@ -891,14 +1039,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
target_tensor=mask_gt,
weights=tf.ones_like(mask_logits))
# TODO(vighneshb) Make this configurable via config.
# Skip normalization for dice loss because the denominator term already
# does normalization.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
return self._aggregate_classification_loss(
loss, mask_gt, mask_logits, 'normalize_auto')
def _compute_per_instance_box_consistency_loss(
self, boxes_gt, boxes_for_crop, mask_logits):
......@@ -930,23 +1072,30 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
loss = 0.0
for axis in [1, 2]:
pred_max = tf.reduce_max(pred_crop, axis=axis)[:, :, tf.newaxis]
if self._deepmac_params.box_consistency_tightness:
pred_max_raw = tf.reduce_max(pred_crop, axis=axis)
pred_max_within_box = tf.reduce_max(pred_crop * gt_crop, axis=axis)
box_1d = tf.reduce_max(gt_crop, axis=axis)
pred_max = ((box_1d * pred_max_within_box) +
((1 - box_1d) * pred_max_raw))
else:
pred_max = tf.reduce_max(pred_crop, axis=axis)
pred_max = pred_max[:, :, tf.newaxis]
gt_max = tf.reduce_max(gt_crop, axis=axis)[:, :, tf.newaxis]
axis_loss = self._deepmac_params.classification_loss(
raw_loss = self._deepmac_params.classification_loss(
prediction_tensor=pred_max,
target_tensor=gt_max,
weights=tf.ones_like(pred_max))
loss += axis_loss
# Skip normalization for dice loss because the denominator term already
# does normalization.
# TODO(vighneshb) Make this configurable via config.
if isinstance(self._deepmac_params.classification_loss,
losses.WeightedDiceClassificationLoss):
return tf.reduce_sum(loss, axis=1)
else:
return tf.reduce_mean(loss, axis=[1, 2])
loss += self._aggregate_classification_loss(
raw_loss, gt_max, pred_max,
self._deepmac_params.box_consistency_loss_normalize)
return loss
def _compute_per_instance_color_consistency_loss(
self, boxes, preprocessed_image, mask_logits):
......@@ -995,6 +1144,17 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
num_box_pixels = tf.maximum(1.0, tf.reduce_sum(box_mask, axis=[1, 2]))
loss = loss / num_box_pixels
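    # Linearly ramp the color consistency loss from zero to its full value,
    # starting at `color_consistency_warmup_start` and reaching full weight
    # after `color_consistency_warmup_steps` additional training steps.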
if ((self._deepmac_params.color_consistency_warmup_steps > 0) and
self._is_training):
training_step = tf.cast(self.training_step, tf.float32)
warmup_steps = tf.cast(
self._deepmac_params.color_consistency_warmup_steps, tf.float32)
start_step = tf.cast(
self._deepmac_params.color_consistency_warmup_start, tf.float32)
warmup_weight = (training_step - start_step) / warmup_steps
warmup_weight = tf.clip_by_value(warmup_weight, 0.0, 1.0)
loss *= warmup_weight
return loss
def _compute_per_instance_deepmac_losses(
......@@ -1084,11 +1244,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
allowed_masked_classes_ids = (
self._deepmac_params.allowed_masked_classes_ids)
loss_dict = {
DEEP_MASK_ESTIMATION: 0.0,
}
for loss_name in WEAK_LOSSES:
loss_dict = {}
for loss_name in MASK_LOSSES:
loss_dict[loss_name] = 0.0
prediction_shape = tf.shape(prediction_dict[INSTANCE_EMBEDDING][0])
......@@ -1148,13 +1305,8 @@ class DeepMACMetaArch(center_net_meta_arch.CenterNetMetaArch):
mask_loss_dict = self._compute_instance_masks_loss(
prediction_dict=prediction_dict)
losses_dict[LOSS_KEY_PREFIX + '/' + DEEP_MASK_ESTIMATION] = (
self._deepmac_params.task_loss_weight * mask_loss_dict[
DEEP_MASK_ESTIMATION]
)
for loss_name in WEAK_LOSSES:
loss_weight = _get_weak_loss_weight(loss_name, self._deepmac_params)
for loss_name in MASK_LOSSES:
loss_weight = _get_loss_weight(loss_name, self._deepmac_params)
if loss_weight > 0.0:
losses_dict[LOSS_KEY_PREFIX + '/' + loss_name] = (
loss_weight * mask_loss_dict[loss_name])
......
"""Tests for google3.third_party.tensorflow_models.object_detection.meta_architectures.deepmac_meta_arch."""
import functools
import random
import unittest
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from google.protobuf import text_format
from object_detection.core import losses
from object_detection.core import preprocessor
from object_detection.meta_architectures import center_net_meta_arch
from object_detection.meta_architectures import deepmac_meta_arch
from object_detection.protos import center_net_pb2
from object_detection.utils import tf_version
DEEPMAC_PROTO_TEXT = """
dim: 153
task_loss_weight: 5.0
pixel_embedding_dim: 8
use_xy: false
use_instance_embedding: false
network_type: "cond_inst3"
num_init_channels: 8
classification_loss {
weighted_dice_classification_loss {
squared_normalization: false
is_prediction_probability: false
}
}
jitter_mode: EXPAND_SYMMETRIC_XY
max_roi_jitter_ratio: 0.0
predict_full_resolution_masks: true
allowed_masked_classes_ids: [99]
box_consistency_loss_weight: 1.0
color_consistency_loss_weight: 1.0
color_consistency_threshold: 0.1
box_consistency_tightness: false
box_consistency_loss_normalize: NORMALIZE_AUTO
color_consistency_warmup_steps: 20
color_consistency_warmup_start: 10
"""
class DummyFeatureExtractor(center_net_meta_arch.CenterNetFeatureExtractor):
def __init__(self,
......@@ -60,14 +93,37 @@ class MockMaskNet(tf.keras.layers.Layer):
return tf.zeros_like(pixel_embedding[:, :, :, 0]) + 0.9
def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
use_instance_embedding=True, mask_num_subsamples=-1,
network_type='hourglass10', use_xy=True,
pixel_embedding_dim=2,
dice_loss_prediction_probability=False,
color_consistency_threshold=0.5):
def build_meta_arch(**override_params):
"""Builds the DeepMAC meta architecture."""
params = dict(
predict_full_resolution_masks=False,
use_instance_embedding=True,
mask_num_subsamples=-1,
network_type='hourglass10',
use_xy=True,
pixel_embedding_dim=2,
dice_loss_prediction_probability=False,
color_consistency_threshold=0.5,
use_dice_loss=False,
box_consistency_loss_normalize='normalize_auto',
box_consistency_tightness=False,
task_loss_weight=1.0,
color_consistency_loss_weight=1.0,
box_consistency_loss_weight=1.0,
num_init_channels=8,
dim=8,
allowed_masked_classes_ids=[],
mask_size=16,
postprocess_crop_size=128,
max_roi_jitter_ratio=0.0,
roi_jitter_mode='random',
color_consistency_dilation=2,
color_consistency_warmup_steps=0,
color_consistency_warmup_start=0)
params.update(override_params)
feature_extractor = DummyFeatureExtractor(
channel_means=(1.0, 2.0, 3.0),
channel_stds=(10., 20., 30.),
......@@ -87,33 +143,18 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
max_box_predictions=5,
use_labeled_classes=False)
use_dice_loss = params.pop('use_dice_loss')
dice_loss_prediction_prob = params.pop('dice_loss_prediction_probability')
if use_dice_loss:
classification_loss = losses.WeightedDiceClassificationLoss(
squared_normalization=False,
is_prediction_probability=dice_loss_prediction_probability)
is_prediction_probability=dice_loss_prediction_prob)
else:
classification_loss = losses.WeightedSigmoidClassificationLoss()
deepmac_params = deepmac_meta_arch.DeepMACParams(
classification_loss=classification_loss,
dim=8,
task_loss_weight=1.0,
pixel_embedding_dim=pixel_embedding_dim,
allowed_masked_classes_ids=[],
mask_size=16,
mask_num_subsamples=mask_num_subsamples,
use_xy=use_xy,
network_type=network_type,
use_instance_embedding=use_instance_embedding,
num_init_channels=8,
predict_full_resolution_masks=predict_full_resolution_masks,
postprocess_crop_size=128,
max_roi_jitter_ratio=0.0,
roi_jitter_mode='random',
box_consistency_loss_weight=1.0,
color_consistency_threshold=color_consistency_threshold,
color_consistency_dilation=2,
color_consistency_loss_weight=1.0
**params
)
object_detection_params = center_net_meta_arch.ObjectDetectionParams(
......@@ -136,6 +177,15 @@ def build_meta_arch(predict_full_resolution_masks=False, use_dice_loss=False,
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
def test_proto_parse(self):
proto = center_net_pb2.CenterNet().DeepMACMaskEstimation()
text_format.Parse(DEEPMAC_PROTO_TEXT, proto)
params = deepmac_meta_arch.deepmac_proto_to_params(proto)
self.assertIsInstance(params, deepmac_meta_arch.DeepMACParams)
self.assertEqual(params.dim, 153)
self.assertEqual(params.box_consistency_loss_normalize, 'normalize_auto')
def test_subsample_trivial(self):
"""Test subsampling masks."""
......@@ -280,18 +330,126 @@ class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
self.assertAllClose(np.ones((8, 5, 5)), output[:, 1, :, :])
self.assertAllClose([1, 0, 0, 0, 0, 0, 0, 1], output[:, 0, 2, 2])
def test_per_pixel_single_conv_multiple_instance(self):
inp = tf.zeros((5, 32, 32, 7))
params = tf.zeros((5, 7*8 + 8))
out = deepmac_meta_arch._per_pixel_single_conv(inp, params, 8)
self.assertEqual(out.shape, (5, 32, 32, 8))
def test_per_pixel_conditional_conv_error(self):
with self.assertRaises(ValueError):
deepmac_meta_arch.per_pixel_conditional_conv(
tf.zeros((10, 32, 32, 8)), tf.zeros((10, 2)), 8, 3)
def test_per_pixel_conditional_conv_error_tf_func(self):
with self.assertRaises(ValueError):
func = tf.function(deepmac_meta_arch.per_pixel_conditional_conv)
func(tf.zeros((10, 32, 32, 8)), tf.zeros((10, 2)), 8, 3)
def test_per_pixel_conditional_conv_depth1_error(self):
with self.assertRaises(ValueError):
_ = deepmac_meta_arch.per_pixel_conditional_conv(
tf.zeros((10, 32, 32, 7)), tf.zeros((10, 8)), 99, 1)
@parameterized.parameters([
{
'num_input_channels': 7,
'instance_embedding_dim': 8,
'channels': 7,
'depth': 1
},
{
'num_input_channels': 7,
'instance_embedding_dim': 82,
'channels': 9,
'depth': 2
},
{ # From https://arxiv.org/abs/2003.05664
'num_input_channels': 10,
'instance_embedding_dim': 169,
'channels': 8,
'depth': 3
},
{
'num_input_channels': 8,
'instance_embedding_dim': 433,
'channels': 16,
'depth': 3
},
{
'num_input_channels': 8,
'instance_embedding_dim': 1377,
'channels': 32,
'depth': 3
},
{
'num_input_channels': 8,
'instance_embedding_dim': 4801,
'channels': 64,
'depth': 3
},
])
def test_per_pixel_conditional_conv_shape(
self, num_input_channels, instance_embedding_dim, channels, depth):
out = deepmac_meta_arch.per_pixel_conditional_conv(
tf.zeros((10, 32, 32, num_input_channels)),
tf.zeros((10, instance_embedding_dim)), channels, depth)
self.assertEqual(out.shape, (10, 32, 32, 1))
def test_per_pixel_conditional_conv_value_depth1(self):
input_tensor = tf.constant(np.array([1, 2, 3]))
input_tensor = tf.reshape(input_tensor, (1, 1, 1, 3))
instance_embedding = tf.constant(
np.array([1, 10, 100, 1000]))
instance_embedding = tf.reshape(instance_embedding, (1, 4))
out = deepmac_meta_arch.per_pixel_conditional_conv(
input_tensor, instance_embedding, channels=3, depth=1)
expected_output = np.array([1321])
expected_output = np.reshape(expected_output, (1, 1, 1, 1))
self.assertAllClose(expected_output, out)
def test_per_pixel_conditional_conv_value_depth2_single(self):
input_tensor = tf.constant(np.array([2]))
input_tensor = tf.reshape(input_tensor, (1, 1, 1, 1))
instance_embedding = tf.constant(
np.array([-2, 3, 100, 5]))
instance_embedding = tf.reshape(instance_embedding, (1, 4))
out = deepmac_meta_arch.per_pixel_conditional_conv(
input_tensor, instance_embedding, channels=1, depth=2)
expected_output = np.array([5])
expected_output = np.reshape(expected_output, (1, 1, 1, 1))
self.assertAllClose(expected_output, out)
def test_per_pixel_conditional_conv_value_depth2_identity(self):
input_tensor = tf.constant(np.array([1, 2]))
input_tensor = tf.reshape(input_tensor, (1, 1, 1, 2))
instance_embedding = tf.constant(
np.array([1, 0, 0, 1, 1, -3, 5, 100, -9]))
instance_embedding = tf.reshape(
instance_embedding, (1, 9))
out = deepmac_meta_arch.per_pixel_conditional_conv(
input_tensor, instance_embedding, channels=2, depth=2)
expected_output = np.array([1])
expected_output = np.reshape(expected_output, (1, 1, 1, 1))
self.assertAllClose(expected_output, out)
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.parameters(
['hourglass10', 'hourglass20', 'resnet4'])
def test_mask_network(self, head_type):
net = deepmac_meta_arch.MaskHeadNetwork(head_type, 8)
out = net(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_params_resnet4(self):
net = deepmac_meta_arch.MaskHeadNetwork('resnet4', num_init_channels=8)
_ = net(tf.zeros((2, 16)), tf.zeros((2, 32, 32, 16)), training=True)
......@@ -301,39 +459,93 @@ class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
self.assertEqual(trainable_params.numpy(), 8665)
def test_mask_network_resnet_tf_function(self):
net = deepmac_meta_arch.MaskHeadNetwork('resnet8')
call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 4)), tf.zeros((2, 32, 32, 16)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
def test_mask_network_embedding_projection_zero(self):
def test_mask_network_embedding_projection_small(self):
net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_projection', num_init_channels=8,
'embedding_projection', num_init_channels=-1,
use_instance_embedding=False)
call_func = tf.function(net.__call__)
out = call_func(tf.zeros((2, 7)), tf.zeros((2, 32, 32, 7)), training=True)
out = call_func(1e6 + tf.zeros((2, 7)),
tf.zeros((2, 32, 32, 7)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf)
def test_mask_network_embedding_projection_small(self):
@parameterized.parameters([
{
'mask_net': 'resnet4',
'mask_net_channels': 8,
'instance_embedding_dim': 4,
'input_channels': 16,
'use_instance_embedding': False
},
{
'mask_net': 'hourglass10',
'mask_net_channels': 8,
'instance_embedding_dim': 4,
'input_channels': 16,
'use_instance_embedding': False
},
{
'mask_net': 'hourglass20',
'mask_net_channels': 8,
'instance_embedding_dim': 4,
'input_channels': 16,
'use_instance_embedding': False
},
{
'mask_net': 'cond_inst3',
'mask_net_channels': 8,
'instance_embedding_dim': 153,
'input_channels': 8,
'use_instance_embedding': False
},
{
'mask_net': 'cond_inst3',
'mask_net_channels': 8,
'instance_embedding_dim': 169,
'input_channels': 10,
'use_instance_embedding': False
},
{
'mask_net': 'cond_inst1',
'mask_net_channels': 8,
'instance_embedding_dim': 9,
'input_channels': 8,
'use_instance_embedding': False
},
{
'mask_net': 'cond_inst2',
'mask_net_channels': 8,
'instance_embedding_dim': 81,
'input_channels': 8,
'use_instance_embedding': False
},
])
def test_mask_network(self, mask_net, mask_net_channels,
instance_embedding_dim, input_channels,
use_instance_embedding):
net = deepmac_meta_arch.MaskHeadNetwork(
'embedding_projection', num_init_channels=-1,
use_instance_embedding=False)
mask_net, num_init_channels=mask_net_channels,
use_instance_embedding=use_instance_embedding)
call_func = tf.function(net.__call__)
out = call_func(1e6 + tf.zeros((2, 7)),
tf.zeros((2, 32, 32, 7)), training=True)
out = call_func(tf.zeros((2, instance_embedding_dim)),
tf.zeros((2, 32, 32, input_channels)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
self.assertAllGreater(out.numpy(), -np.inf)
self.assertAllLess(out.numpy(), np.inf)
out = call_func(tf.zeros((2, instance_embedding_dim)),
tf.zeros((2, 32, 32, input_channels)), training=True)
self.assertEqual(out.shape, (2, 32, 32))
out = call_func(tf.zeros((0, instance_embedding_dim)),
tf.zeros((0, 32, 32, input_channels)), training=True)
self.assertEqual(out.shape, (0, 32, 32))
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
......@@ -619,8 +831,85 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
xloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 16),
logits=[1.0] * 12 + [0.0] * 4)
yloss_mean = tf.reduce_mean(yloss)
xloss_mean = tf.reduce_mean(xloss)
self.assertAllClose(loss, [yloss_mean + xloss_mean])
def test_box_consistency_loss_with_tightness(self):
boxes_gt = tf.constant([[0., 0., 0.49, 0.49]])
boxes_jittered = None
mask_prediction = np.zeros((1, 8, 8)).astype(np.float32) - 1e10
mask_prediction[0, :4, :4] = 1e10
self.assertAllClose(loss, [tf.reduce_mean(yloss + xloss).numpy()])
model = build_meta_arch(box_consistency_tightness=True,
predict_full_resolution_masks=True)
loss = model._compute_per_instance_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
self.assertAllClose(loss, [0.0])
def test_box_consistency_loss_gt_count(self):
boxes_gt = tf.constant([
[0., 0., 1.0, 1.0],
[0., 0., 0.49, 0.49]])
boxes_jittered = None
mask_prediction = np.zeros((2, 32, 32)).astype(np.float32)
mask_prediction[0, :16, :16] = 1.0
mask_prediction[1, :8, :8] = 1.0
model = build_meta_arch(
box_consistency_loss_normalize='normalize_groundtruth_count',
predict_full_resolution_masks=True)
loss_func = tf.function(
model._compute_per_instance_box_consistency_loss)
loss = loss_func(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
yloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 32),
logits=[1.0] * 16 + [0.0] * 16) / 32.0
yloss_mean = tf.reduce_sum(yloss)
xloss = yloss
xloss_mean = tf.reduce_sum(xloss)
self.assertAllClose(loss[0], yloss_mean + xloss_mean)
yloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 16 + [0.0] * 16),
logits=[1.0] * 8 + [0.0] * 24) / 16.0
yloss_mean = tf.reduce_sum(yloss)
xloss = yloss
xloss_mean = tf.reduce_sum(xloss)
self.assertAllClose(loss[1], yloss_mean + xloss_mean)
def test_box_consistency_loss_balanced(self):
boxes_gt = tf.constant([
[0., 0., 0.49, 0.49]])
boxes_jittered = None
mask_prediction = np.zeros((1, 32, 32)).astype(np.float32)
mask_prediction[0] = 1.0
model = build_meta_arch(box_consistency_loss_normalize='normalize_balanced',
predict_full_resolution_masks=True)
loss_func = tf.function(
model._compute_per_instance_box_consistency_loss)
loss = loss_func(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
yloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=[0.] * 16 + [1.0] * 16,
logits=[1.0] * 32)
yloss_mean = tf.reduce_sum(yloss) / 16.0
xloss_mean = yloss_mean
self.assertAllClose(loss[0], yloss_mean + xloss_mean)
def test_box_consistency_dice_loss(self):
......@@ -701,34 +990,145 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
for weak_loss in deepmac_meta_arch.WEAK_LOSSES:
for weak_loss in deepmac_meta_arch.MASK_LOSSES:
if weak_loss == deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY:
continue
self.assertGreater(loss['Loss/' + weak_loss], 0.0,
'{} was <= 0'.format(weak_loss))
def test_loss_keys_full_res(self):
model = build_meta_arch(use_dice_loss=True,
predict_full_resolution_masks=True)
def test_loss_weight_response(self):
model = build_meta_arch(
use_dice_loss=True,
predict_full_resolution_masks=True,
network_type='cond_inst1',
dim=9,
pixel_embedding_dim=8,
use_instance_embedding=False,
use_xy=False)
num_stages = 1
prediction = {
'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 17))] * 2,
'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 19))] * 2,
'object_center': [tf.random.normal((1, 8, 8, 6))] * 2,
'box/offset': [tf.random.normal((1, 8, 8, 2))] * 2,
'box/scale': [tf.random.normal((1, 8, 8, 2))] * 2
'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 9))] * num_stages,
'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 8))] * num_stages,
'object_center': [tf.random.normal((1, 8, 8, 6))] * num_stages,
'box/offset': [tf.random.normal((1, 8, 8, 2))] * num_stages,
'box/scale': [tf.random.normal((1, 8, 8, 2))] * num_stages
}
boxes = [tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)]
classes = [tf.one_hot([1, 0, 1, 1, 1], depth=6)]
weights = [tf.ones(5)]
masks = [tf.ones((5, 32, 32))]
model.provide_groundtruth(
groundtruth_boxes_list=[tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)],
groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)],
groundtruth_weights_list=[tf.ones(5)],
groundtruth_masks_list=[tf.ones((5, 32, 32))])
groundtruth_boxes_list=boxes,
groundtruth_classes_list=classes,
groundtruth_weights_list=weights,
groundtruth_masks_list=masks)
loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
for weak_loss in deepmac_meta_arch.WEAK_LOSSES:
self.assertGreater(loss['Loss/' + weak_loss], 0.0,
'{} was <= 0'.format(weak_loss))
for mask_loss in deepmac_meta_arch.MASK_LOSSES:
self.assertGreater(loss['Loss/' + mask_loss], 0.0,
'{} was <= 0'.format(mask_loss))
rng = random.Random(0)
loss_weights = {
deepmac_meta_arch.DEEP_MASK_ESTIMATION: rng.uniform(1, 5),
deepmac_meta_arch.DEEP_MASK_BOX_CONSISTENCY: rng.uniform(1, 5),
deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY: rng.uniform(1, 5)
}
weighted_model = build_meta_arch(
use_dice_loss=True,
predict_full_resolution_masks=True,
network_type='cond_inst1',
dim=9,
pixel_embedding_dim=8,
use_instance_embedding=False,
use_xy=False,
task_loss_weight=loss_weights[deepmac_meta_arch.DEEP_MASK_ESTIMATION],
box_consistency_loss_weight=(
loss_weights[deepmac_meta_arch.DEEP_MASK_BOX_CONSISTENCY]),
color_consistency_loss_weight=(
loss_weights[deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY]))
weighted_model.provide_groundtruth(
groundtruth_boxes_list=boxes,
groundtruth_classes_list=classes,
groundtruth_weights_list=weights,
groundtruth_masks_list=masks)
weighted_loss = weighted_model.loss(prediction, tf.constant([[32, 32, 3]]))
for mask_loss in deepmac_meta_arch.MASK_LOSSES:
loss_key = 'Loss/' + mask_loss
self.assertAllEqual(
weighted_loss[loss_key], loss[loss_key] * loss_weights[mask_loss],
f'{mask_loss} did not respond to change in weight.')
def test_color_consistency_warmup(self):
model = build_meta_arch(
use_dice_loss=True,
predict_full_resolution_masks=True,
network_type='cond_inst1',
dim=9,
pixel_embedding_dim=8,
use_instance_embedding=False,
use_xy=False,
color_consistency_warmup_steps=10,
color_consistency_warmup_start=10)
num_stages = 1
prediction = {
'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 9))] * num_stages,
'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 8))] * num_stages,
'object_center': [tf.random.normal((1, 8, 8, 6))] * num_stages,
'box/offset': [tf.random.normal((1, 8, 8, 2))] * num_stages,
'box/scale': [tf.random.normal((1, 8, 8, 2))] * num_stages
}
boxes = [tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)]
classes = [tf.one_hot([1, 0, 1, 1, 1], depth=6)]
weights = [tf.ones(5)]
masks = [tf.ones((5, 32, 32))]
model.provide_groundtruth(
groundtruth_boxes_list=boxes,
groundtruth_classes_list=classes,
groundtruth_weights_list=weights,
groundtruth_masks_list=masks,
training_step=5)
loss_at_5 = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
model.provide_groundtruth(
groundtruth_boxes_list=boxes,
groundtruth_classes_list=classes,
groundtruth_weights_list=weights,
groundtruth_masks_list=masks,
training_step=15)
loss_at_15 = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
model.provide_groundtruth(
groundtruth_boxes_list=boxes,
groundtruth_classes_list=classes,
groundtruth_weights_list=weights,
groundtruth_masks_list=masks,
training_step=20)
loss_at_20 = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
model.provide_groundtruth(
groundtruth_boxes_list=boxes,
groundtruth_classes_list=classes,
groundtruth_weights_list=weights,
groundtruth_masks_list=masks,
training_step=100)
loss_at_100 = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
loss_key = 'Loss/' + deepmac_meta_arch.DEEP_MASK_COLOR_CONSISTENCY
self.assertAlmostEqual(loss_at_5[loss_key].numpy(), 0.0)
self.assertAlmostEqual(loss_at_15[loss_key].numpy(),
loss_at_20[loss_key].numpy() / 2.0)
self.assertAlmostEqual(loss_at_20[loss_key].numpy(),
loss_at_100[loss_key].numpy())
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
......
......@@ -114,6 +114,10 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
'groundtruth_not_exhaustive_classes': [batch_size, num_classes] K-hot
representation of 1-indexed classes which don't have all of their
instances marked exhaustively.
      'groundtruth_image_classes': integer representation of the classes that
        were sent for verification for a given image. Note that this field
        does not support batching as the number of classes can be variable.
class_agnostic: Boolean indicating whether detections are class agnostic.
"""
input_data_fields = fields.InputDataFields()
......@@ -136,6 +140,18 @@ def _prepare_groundtruth_for_eval(detection_model, class_agnostic,
input_data_fields.groundtruth_classes: groundtruth_classes
}
if detection_model.groundtruth_has_field(
input_data_fields.groundtruth_image_classes):
groundtruth_image_classes_k_hot = tf.stack(
detection_model.groundtruth_lists(
input_data_fields.groundtruth_image_classes))
# We do not add label_id_offset here because it was not added when encoding
# groundtruth_image_classes.
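    # `tf.where` on the k-hot tensor yields (image, class) index pairs; we keep
    # only the class column and add back a leading batch dimension of one.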
groundtruth_image_classes = tf.expand_dims(
tf.where(groundtruth_image_classes_k_hot > 0)[:, 1], 0)
groundtruth[
input_data_fields.groundtruth_image_classes] = groundtruth_image_classes
if detection_model.groundtruth_has_field(fields.BoxListFields.masks):
groundtruth[input_data_fields.groundtruth_instance_masks] = tf.stack(
detection_model.groundtruth_lists(fields.BoxListFields.masks))
......@@ -303,7 +319,7 @@ def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True):
return unbatched_tensor_dict
def provide_groundtruth(model, labels):
def provide_groundtruth(model, labels, training_step=None):
"""Provides the labels to a model as groundtruth.
This helper function extracts the corresponding boxes, classes,
......@@ -313,6 +329,8 @@ def provide_groundtruth(model, labels):
Args:
model: The detection model to provide groundtruth to.
labels: The labels for the training or evaluation inputs.
training_step: int, optional. The training step for the model. Useful
for models which want to anneal loss weights.
"""
gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
......@@ -382,6 +400,10 @@ def provide_groundtruth(model, labels):
if fields.InputDataFields.groundtruth_not_exhaustive_classes in labels:
gt_not_exhaustive_classes = labels[
fields.InputDataFields.groundtruth_not_exhaustive_classes]
groundtruth_image_classes = None
if fields.InputDataFields.groundtruth_image_classes in labels:
groundtruth_image_classes = labels[
fields.InputDataFields.groundtruth_image_classes]
model.provide_groundtruth(
groundtruth_boxes_list=gt_boxes_list,
groundtruth_classes_list=gt_classes_list,
......@@ -402,7 +424,9 @@ def provide_groundtruth(model, labels):
groundtruth_verified_neg_classes=gt_verified_neg_classes,
groundtruth_not_exhaustive_classes=gt_not_exhaustive_classes,
groundtruth_keypoint_depths_list=gt_keypoint_depths_list,
groundtruth_keypoint_depth_weights_list=gt_keypoint_depth_weights_list)
groundtruth_keypoint_depth_weights_list=gt_keypoint_depth_weights_list,
groundtruth_image_classes=groundtruth_image_classes,
training_step=training_step)
def create_model_fn(detection_model_fn, configs, hparams=None, use_tpu=False,
......
......@@ -51,7 +51,7 @@ RESTORE_MAP_ERROR_TEMPLATE = (
def _compute_losses_and_predictions_dicts(
model, features, labels,
model, features, labels, training_step=None,
add_regularization_loss=True):
"""Computes the losses dict and predictions dict for a model on inputs.
......@@ -107,6 +107,7 @@ def _compute_losses_and_predictions_dicts(
float32 tensor containing keypoint depths information.
labels[fields.InputDataFields.groundtruth_keypoint_depth_weights] is a
float32 tensor containing the weights of the keypoint depth feature.
training_step: int, the current training step.
add_regularization_loss: Whether or not to include the model's
regularization loss in the losses dictionary.
......@@ -116,7 +117,7 @@ def _compute_losses_and_predictions_dicts(
`model.predict`.
"""
model_lib.provide_groundtruth(model, labels)
model_lib.provide_groundtruth(model, labels, training_step=training_step)
preprocessed_images = features[fields.InputDataFields.image]
prediction_dict = model.predict(
......@@ -166,7 +167,8 @@ def _ensure_model_is_built(model, input_dataset, unpad_groundtruth_tensors):
labels = model_lib.unstack_batch(
labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
return _compute_losses_and_predictions_dicts(model, features, labels)
return _compute_losses_and_predictions_dicts(model, features, labels,
training_step=0)
strategy = tf.compat.v2.distribute.get_strategy()
if hasattr(tf.distribute.Strategy, 'run'):
......@@ -208,6 +210,7 @@ def eager_train_step(detection_model,
labels,
unpad_groundtruth_tensors,
optimizer,
training_step,
add_regularization_loss=True,
clip_gradients_value=None,
num_replicas=1.0):
......@@ -280,6 +283,7 @@ def eager_train_step(detection_model,
float32 tensor containing the weights of the keypoint depth feature.
unpad_groundtruth_tensors: A parameter passed to unstack_batch.
optimizer: The training optimizer that will update the variables.
training_step: int, the training step number.
add_regularization_loss: Whether or not to include the model's
regularization loss in the losses dictionary.
clip_gradients_value: If this is present, clip the gradients global norm
......@@ -302,7 +306,9 @@ def eager_train_step(detection_model,
with tf.GradientTape() as tape:
losses_dict, _ = _compute_losses_and_predictions_dicts(
detection_model, features, labels, add_regularization_loss)
detection_model, features, labels,
training_step=training_step,
add_regularization_loss=add_regularization_loss)
losses_dict = normalize_dict(losses_dict, num_replicas)
......@@ -632,6 +638,7 @@ def train_loop(
labels,
unpad_groundtruth_tensors,
optimizer,
training_step=global_step,
add_regularization_loss=add_regularization_loss,
clip_gradients_value=clip_gradients_value,
num_replicas=strategy.num_replicas_in_sync)
......@@ -901,7 +908,8 @@ def eager_eval_loop(
labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)
losses_dict, prediction_dict = _compute_losses_and_predictions_dicts(
detection_model, features, labels, add_regularization_loss)
detection_model, features, labels, training_step=None,
add_regularization_loss=add_regularization_loss)
prediction_dict = detection_model.postprocess(
prediction_dict, features[fields.InputDataFields.true_image_shape])
eval_features = {
......